001 package org.maltparser.parser.guide.instance;
002
003 import java.io.BufferedReader;
004 import java.io.BufferedWriter;
005 import java.io.IOException;
006 import java.util.SortedMap;
007
008 import java.util.ArrayList;
009 import java.util.TreeMap;
010 import java.util.TreeSet;
011 import java.util.regex.Pattern;
012
013 import org.maltparser.core.exception.MaltChainedException;
014 import org.maltparser.core.feature.FeatureException;
015 import org.maltparser.core.feature.FeatureVector;
016 import org.maltparser.core.feature.function.FeatureFunction;
017 import org.maltparser.core.feature.function.Modifiable;
018 import org.maltparser.core.feature.value.SingleFeatureValue;
019 import org.maltparser.core.syntaxgraph.DependencyStructure;
020 import org.maltparser.parser.guide.ClassifierGuide;
021 import org.maltparser.parser.guide.GuideException;
022 import org.maltparser.parser.guide.Model;
023 import org.maltparser.parser.history.action.SingleDecision;
024
025 /**
026 The feature divide model is used for divide the training instances into several models according to
027 a divide feature. Usually this strategy decrease the training and classification time, but can also decrease
028 the accuracy of the parser.
029
030 @author Johan Hall
031 @since 1.0
032 */
033 public class FeatureDivideModel implements InstanceModel {
034 private Model parent;
035 private final SortedMap<Integer,AtomicModel> divideModels;
036 private FeatureVector masterFeatureVector;
037 private FeatureVector divideFeatureVector;
038 private int frequency = 0;
039 private FeatureFunction divideFeature;
040 private int divideThreshold;
041 private AtomicModel masterModel;
042 private ArrayList<Integer> divideFeatureIndexVector;
043
044 /**
045 * Constructs a feature divide model.
046 *
047 * @param features the feature vector used by the atomic model.
048 * @param parent the parent guide model.
049 * @throws MaltChainedException
050 */
051 public FeatureDivideModel(FeatureVector features, Model parent) throws MaltChainedException {
052 setParent(parent);
053 setFrequency(0);
054 initSplitParam(features);
055 divideModels = new TreeMap<Integer,AtomicModel>();
056 if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) {
057 masterModel = new AtomicModel(-1, masterFeatureVector, this);
058 } else if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.CLASSIFY) {
059 load();
060 }
061 }
062
063 public void addInstance(SingleDecision decision) throws MaltChainedException {
064 if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.CLASSIFY) {
065 throw new GuideException("Can only add instance during learning. ");
066 } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) {
067 throw new GuideException("The divide feature does not have a single value. ");
068 }
069
070 divideFeature.update();
071 if (divideModels != null) {
072 if (!divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) {
073 divideModels.put(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode(), new AtomicModel(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode(), divideFeatureVector, this));
074 }
075 divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()).addInstance(decision);
076 } else {
077 throw new GuideException("The feature divide models cannot be found. ");
078 }
079 }
080
081 public void noMoreInstances() throws MaltChainedException {
082 // if (getGuide().getGuideMode() == Guide.GuideMode.CLASSIFY) {
083 // throw new GuideException("Can only finish all data during learning. ");
084 // }
085
086 if (divideModels != null) {
087 divideFeature.updateCardinality();
088 for (Integer index : divideModels.keySet()) {
089 divideModels.get(index).noMoreInstances();
090 }
091 final TreeSet<Integer> removeSet = new TreeSet<Integer>();
092 for (Integer index : divideModels.keySet()) {
093 if (divideModels.get(index).getFrequency() <= divideThreshold) {
094 divideModels.get(index).moveAllInstances(masterModel, divideFeature, divideFeatureIndexVector);
095 removeSet.add(index);
096 }
097 }
098 for (Integer index : removeSet) {
099 divideModels.remove(index);
100 }
101 masterModel.noMoreInstances();
102
103 } else {
104 throw new GuideException("The feature divide models cannot be found. ");
105 }
106 }
107
108 public void finalizeSentence(DependencyStructure dependencyGraph) throws MaltChainedException {
109 // if (getGuide().getGuideMode() == Guide.GuideMode.CLASSIFY) {
110 // throw new GuideException("Can only finish sentence during learning. ");
111 // }
112
113 if (divideModels != null) {
114 for (AtomicModel divideModel : divideModels.values()) {
115 divideModel.finalizeSentence(dependencyGraph);
116 }
117 } else {
118 throw new GuideException("The feature divide models cannot be found. ");
119 }
120 }
121
122 public boolean predict(SingleDecision decision) throws MaltChainedException {
123 if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) {
124 throw new GuideException("Can only predict during parsing. ");
125 } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) {
126 throw new GuideException("The divide feature does not have a single value. ");
127 }
128
129 //divideFeature.update();
130 if (divideModels != null && divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) {
131 return divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()).predict(decision);
132 } else if (masterModel != null && masterModel.getFrequency() > 0) {
133 return masterModel.predict(decision);
134 } else {
135 getGuide().getConfiguration().getConfigLogger().info("Could not predict the next parser decision because there is " +
136 "no divide or master model that covers the divide value '"+((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()+"', as default" +
137 " class code '1' is used. ");
138
139 decision.addDecision(1); // default prediction
140 //classCodeTable.getEmptyKBestList().addKBestItem(1);
141 }
142 return true;
143 }
144
145 public FeatureVector predictExtract(SingleDecision decision) throws MaltChainedException {
146 return getAtomicModel().predictExtract(decision);
147 }
148
149 public FeatureVector extract() throws MaltChainedException {
150 return getAtomicModel().extract();
151 }
152
153 private AtomicModel getAtomicModel() throws MaltChainedException {
154 if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) {
155 throw new GuideException("Can only predict during parsing. ");
156 } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) {
157 throw new GuideException("The divide feature does not have a single value. ");
158 }
159
160 if (divideModels != null && divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) {
161 return divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode());
162 } else if (masterModel != null && masterModel.getFrequency() > 0) {
163 return masterModel;
164 } else {
165 getGuide().getConfiguration().getConfigLogger().info("Could not predict the next parser decision because there is " +
166 "no divide or master model that covers the divide value '"+((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()+"', as default" +
167 " class code '1' is used. ");
168 }
169 return null;
170 }
171
172 public void terminate() throws MaltChainedException {
173 if (divideModels != null) {
174 for (AtomicModel divideModel : divideModels.values()) {
175 divideModel.terminate();
176 }
177 }
178 if (masterModel != null) {
179 masterModel.terminate();
180 }
181 }
182
183 public void train() throws MaltChainedException {
184 for (AtomicModel divideModel : divideModels.values()) {
185 divideModel.train();
186 }
187 masterModel.train();
188 save();
189 for (AtomicModel divideModel : divideModels.values()) {
190 divideModel.terminate();
191 }
192 masterModel.terminate();
193 }
194
195 /**
196 * Initialize the feature split parameters and the split feature vector and master feature vector
197 * according to the behavior strategy.
198 *
199 * @param featureVector the parent guide model's feature vector.
200 * @throws MaltChainedException
201 */
202 protected void initSplitParam(FeatureVector featureVector) throws MaltChainedException {
203 if (getGuide().getConfiguration().getOptionValue("guide", "data_split_column") == null
204 || getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString().length() == 0) {
205 throw new GuideException("The option '--guide-data_split_column' cannot be found, when initializing the data split. ");
206 }
207 if (getGuide().getConfiguration().getOptionValue("guide", "data_split_structure") == null
208 || getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString().length() == 0) {
209 throw new GuideException("The option '--guide-data_split_structure' cannot be found, when initializing the data split. ");
210 }
211 try {
212 final String spec = "InputColumn(" + getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString().trim()+
213 ", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString().trim() +")";
214 divideFeature = featureVector.getFeatureModel().identifyFeature(spec);
215 } catch (FeatureException e) {
216 throw new GuideException("The data split feature 'InputColumn("+getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString()+", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString()+") cannot be initialized. ", e);
217 }
218 if (!(divideFeature instanceof Modifiable)) {
219 throw new GuideException("The data split feature 'InputColumn("+getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString()+", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString()+") does not implement Modifiable interface. ");
220 }
221 divideFeatureIndexVector = new ArrayList<Integer>();
222 for (int i = 0; i < featureVector.size(); i++) {
223 if (featureVector.get(i).equals(divideFeature)) {
224 divideFeatureIndexVector.add(i);
225 }
226 }
227
228 // if ((Boolean)getGuide().getConfiguration().getOptionValue("malt0.4", "behavior") == true) {
229 // /* MaltParser 0.4 removes the divide feature for all divide models. For the "Sum-up" model or
230 // * master model adds the divide feature in the end of the feature vector.
231 // */
232 // masterFeatureVector = (FeatureVector)featureVector.clone();
233 // for (Integer i : divideFeatureIndexVector) {
234 // masterFeatureVector.remove(masterFeatureVector.get(i));
235 // }
236 // for (Integer i : divideFeatureIndexVector) {
237 // masterFeatureVector.add(featureVector.get(i));
238 // }
239 //
240 // divideFeatureVector = (FeatureVector)featureVector.clone();
241 // for (Integer i : divideFeatureIndexVector) {
242 // divideFeatureVector.remove(divideFeatureVector.get(i));
243 // }
244 // } else {
245 masterFeatureVector = featureVector;
246 divideFeatureVector = (FeatureVector)featureVector.clone();
247 for (Integer i : divideFeatureIndexVector) {
248 divideFeatureVector.remove(divideFeatureVector.get(i));
249 }
250 // }
251 try {
252 if (getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold").toString() != null) {
253 divideThreshold = Integer.parseInt(getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold").toString());
254 } else {
255 divideThreshold = 0;
256 }
257 } catch (NumberFormatException e) {
258 throw new GuideException("The --guide-data_split_threshold option is not an integer value. ", e);
259 }
260 }
261
262 /**
263 * Saves the feature divide model settings .fsm file.
264 *
265 * @throws MaltChainedException
266 */
267 protected void save() throws MaltChainedException {
268 try {
269 final BufferedWriter out = new BufferedWriter(getGuide().getConfiguration().getConfigurationDir().getOutputStreamWriter(getModelName()+".dsm"));
270 out.write(masterModel.getIndex() + "\t" + masterModel.getFrequency() + "\n");
271
272 if (divideModels != null) {
273 for (AtomicModel divideModel : divideModels.values()) {
274 out.write(divideModel.getIndex() + "\t" + divideModel.getFrequency() + "\n");
275 }
276 }
277 out.close();
278 } catch (IOException e) {
279 throw new GuideException("Could not write to the guide model settings file '"+getModelName()+".dsm"+"', when " +
280 "saving the guide model settings to file. ", e);
281 }
282 }
283
284 /**
285 * Loads the feature divide model settings .fsm file.
286 *
287 * @throws MaltChainedException
288 */
289 protected void load() throws MaltChainedException {
290 try {
291 final BufferedReader in = new BufferedReader(getGuide().getConfiguration().getConfigurationDir().getInputStreamReaderFromConfigFile(getModelName()+".dsm"));
292 final Pattern tabPattern = Pattern.compile("\t");
293 while(true) {
294 String line = in.readLine();
295 if(line == null) break;
296 String[] cols = tabPattern.split(line);
297 if (cols.length != 2) {
298 throw new GuideException("");
299 }
300 int code = -1;
301 int freq = 0;
302 try {
303 code = Integer.parseInt(cols[0]);
304 freq = Integer.parseInt(cols[1]);
305 } catch (NumberFormatException e) {
306 throw new GuideException("Could not convert a string value into an integer value when loading the feature divide model settings (.fsm). ", e);
307 }
308 if (code == -1) {
309 masterModel = new AtomicModel(-1, masterFeatureVector, this);
310 masterModel.setFrequency(freq);
311 } else if (divideModels != null) {
312 divideModels.put(code, new AtomicModel(code, divideFeatureVector, this));
313 divideModels.get(code).setFrequency(freq);
314 }
315 setFrequency(getFrequency()+freq);
316 }
317 in.close();
318 } catch (IOException e) {
319 throw new GuideException("Could not read from the guide model settings file '"+getModelName()+".dsm"+"', when " +
320 "loading the guide model settings. ", e);
321 }
322 }
323
324 /**
325 * Returns the parent model
326 *
327 * @return the parent model
328 */
329 public Model getParent() {
330 return parent;
331 }
332
333 public ClassifierGuide getGuide() {
334 return parent.getGuide();
335 }
336
337 /**
338 * Sets the parent model
339 *
340 * @param parent the parent model
341 */
342 protected void setParent(Model parent) throws MaltChainedException {
343 this.parent = parent;
344 }
345
346
347 public String getModelName() throws MaltChainedException {
348 try {
349 return parent.getModelName();
350 } catch (NullPointerException e) {
351 throw new GuideException("The parent guide model cannot be found. ", e);
352 }
353 }
354
355 /**
356 * Returns the "sum-up" or master feature vector
357 *
358 * @return a feature vector object
359 */
360 public FeatureVector getMasterFeatureVector() {
361 return masterFeatureVector;
362 }
363
364 /**
365 * Returns the divide feature vector
366 *
367 * @return a feature vector object
368 */
369 public FeatureVector getDivideFeatureVector() {
370 return divideFeatureVector;
371 }
372
373 /**
374 * Returns the frequency (number of instances)
375 *
376 * @return the frequency (number of instances)
377 */
378 public int getFrequency() {
379 return frequency;
380 }
381
382 /**
383 * Increase the frequency by 1
384 */
385 public void increaseFrequency() {
386 if (parent instanceof InstanceModel) {
387 ((InstanceModel)parent).increaseFrequency();
388 }
389 frequency++;
390 }
391
392 public void decreaseFrequency() {
393 if (parent instanceof InstanceModel) {
394 ((InstanceModel)parent).decreaseFrequency();
395 }
396 frequency--;
397 }
398
399 /**
400 * Sets the frequency (number of instances)
401 *
402 * @param frequency (number of instances)
403 */
404 protected void setFrequency(int frequency) {
405 this.frequency = frequency;
406 }
407
408
409 /* (non-Javadoc)
410 * @see java.lang.Object#toString()
411 */
412 public String toString() {
413 final StringBuilder sb = new StringBuilder();
414 //TODO
415 return sb.toString();
416 }
417 }