001 package org.maltparser.ml.liblinear;
002
003 import java.io.BufferedReader;
004 import java.io.BufferedWriter;
005 import java.io.File;
006 import java.io.FileNotFoundException;
007 import java.io.IOException;
008 import java.io.InputStream;
009 import java.io.InputStreamReader;
010 import java.io.OutputStreamWriter;
011 import java.io.PrintStream;
012 import java.util.ArrayList;
013 import java.util.HashMap;
014 import java.util.LinkedHashMap;
015 import java.util.Map;
016 import java.util.Set;
017 import java.util.jar.JarEntry;
018 import java.util.regex.Pattern;
019 import java.util.regex.PatternSyntaxException;
020
021 import liblinear.FeatureNode;
022 import liblinear.Linear;
023 import liblinear.Model;
024 import liblinear.Parameter;
025 import liblinear.Problem;
026 import liblinear.SolverType;
027
028
029
030 import org.maltparser.core.exception.MaltChainedException;
031 import org.maltparser.core.feature.FeatureVector;
032 import org.maltparser.core.feature.function.FeatureFunction;
033 import org.maltparser.core.feature.value.FeatureValue;
034 import org.maltparser.core.feature.value.MultipleFeatureValue;
035 import org.maltparser.core.feature.value.SingleFeatureValue;
036 import org.maltparser.core.helper.NoPrintStream;
037 import org.maltparser.core.syntaxgraph.DependencyStructure;
038 import org.maltparser.ml.LearningMethod;
039 import org.maltparser.parser.DependencyParserConfig;
040 import org.maltparser.parser.guide.instance.InstanceModel;
041 import org.maltparser.parser.history.action.SingleDecision;
042 import org.maltparser.parser.history.kbest.KBestList;
043 import org.maltparser.parser.history.kbest.ScoredKBestList;
044
045
046 public class Liblinear implements LearningMethod {
047 public final static String LIBLINEAR_VERSION = "1.51";
048 public enum Verbostity {
049 SILENT, ERROR, ALL
050 }
051 private LinkedHashMap<String, String> liblinearOptions;
052
053 protected InstanceModel owner;
054 protected int learnerMode;
055 protected String name;
056 protected int numberOfInstances;
057 protected boolean saveInstanceFiles;
058 protected boolean excludeNullValues;
059 protected String pathExternalLiblinearTrain = null;
060 private int[] cardinalities;
061 /**
062 * Instance output stream writer
063 */
064 private BufferedWriter instanceOutput = null;
065 /**
066 * Liblinear model object, only used during classification.
067 */
068 private Model model = null;
069
070 /**
071 * Parameter string
072 */
073 private String paramString;
074
075 private ArrayList<FeatureNode> xlist = null;
076
077 private Verbostity verbosity;
078 /**
079 * Constructs a Liblinear learner.
080 *
081 * @param owner the guide model owner
082 * @param learnerMode the mode of the learner TRAIN or CLASSIFY
083 */
084 public Liblinear(InstanceModel owner, Integer learnerMode) throws MaltChainedException {
085 setOwner(owner);
086 setLearningMethodName("liblinear");
087 setLearnerMode(learnerMode.intValue());
088 setNumberOfInstances(0);
089 verbosity = Verbostity.SILENT;
090
091 liblinearOptions = new LinkedHashMap<String, String>();
092 initLiblinearOptions();
093 parseParameters(getConfiguration().getOptionValue("liblinear", "liblinear_options").toString());
094 initSpecialParameters();
095 if (learnerMode == BATCH) {
096 // if (owner.getGuide().getConfiguration().getConfigLogger().isInfoEnabled()) {
097 // if (pathExternalLiblinearTrain != null) {
098 // owner.getGuide().getConfiguration().getConfigLogger().info(" Learner : Liblinear external "+ getLibLinearOptions() + "\n");
099 // } else {
100 // owner.getGuide().getConfiguration().getConfigLogger().info(" Learner : Liblinear "+LIBLINEAR_VERSION+" "+ getLibLinearOptions() + "\n");
101 // }
102 // }
103 instanceOutput = new BufferedWriter(getInstanceOutputStreamWriter(".ins"));
104 }
105 // else {
106 // if (owner.getGuide().getConfiguration().getConfigLogger().isInfoEnabled()) {
107 // owner.getGuide().getConfiguration().getConfigLogger().info(" Classifier : Liblinear "+LIBLINEAR_VERSION+" "+ getLibLinearOptions()+ "\n");
108 // }
109 // }
110 }
111
112
113 public void addInstance(SingleDecision decision, FeatureVector featureVector) throws MaltChainedException {
114 if (featureVector == null) {
115 throw new LiblinearException("The feature vector cannot be found");
116 } else if (decision == null) {
117 throw new LiblinearException("The decision cannot be found");
118 }
119 try {
120 instanceOutput.write(decision.getDecisionCode()+"\t");
121 for (int i = 0; i < featureVector.size(); i++) {
122 FeatureValue featureValue = featureVector.get(i).getFeatureValue();
123 if (excludeNullValues == true && featureValue.isNullValue()) {
124 instanceOutput.write("-1");
125 } else {
126 if (featureValue instanceof SingleFeatureValue) {
127 instanceOutput.write(((SingleFeatureValue)featureValue).getCode()+"");
128 } else if (featureValue instanceof MultipleFeatureValue) {
129 Set<Integer> values = ((MultipleFeatureValue)featureValue).getCodes();
130 int j=0;
131 for (Integer value : values) {
132 instanceOutput.write(value.toString());
133 if (j != values.size()-1) {
134 instanceOutput.write("|");
135 }
136 j++;
137 }
138 }
139 }
140 if (i != featureVector.size()) {
141 instanceOutput.write('\t');
142 }
143 }
144
145 instanceOutput.write('\n');
146 instanceOutput.flush();
147 increaseNumberOfInstances();
148 } catch (IOException e) {
149 throw new LiblinearException("The Liblinear learner cannot write to the instance file. ", e);
150 }
151 }
152
153 public void finalizeSentence(DependencyStructure dependencyGraph) throws MaltChainedException { }
154
155 /* (non-Javadoc)
156 * @see org.maltparser.ml.LearningMethod#noMoreInstances()
157 */
158 public void noMoreInstances() throws MaltChainedException {
159 closeInstanceWriter();
160 }
161
162
163 /* (non-Javadoc)
164 * @see org.maltparser.ml.LearningMethod#train(org.maltparser.parser.guide.feature.FeatureVector)
165 */
166 public void train(FeatureVector featureVector) throws MaltChainedException {
167 if (featureVector == null) {
168 throw new LiblinearException("The feature vector cannot be found. ");
169 } else if (owner == null) {
170 throw new LiblinearException("The parent guide model cannot be found. ");
171 }
172 cardinalities = getCardinalities(featureVector);
173 if (pathExternalLiblinearTrain == null) {
174 try {
175 final Problem problem = readLibLinearProblem(getInstanceInputStreamReader(".ins"), cardinalities);
176 if (owner.getGuide().getConfiguration().getConfigLogger().isInfoEnabled()) {
177 owner.getGuide().getConfiguration().getConfigLogger().info("Creating Liblinear model "+getFile(".mod").getName()+"\n");
178 }
179 final PrintStream out = System.out;
180 final PrintStream err = System.err;
181 System.setOut(NoPrintStream.NO_PRINTSTREAM);
182 System.setErr(NoPrintStream.NO_PRINTSTREAM);
183 Linear.saveModel(new File(getFile(".mod").getAbsolutePath()), Linear.train(problem, getLiblinearParameters()));
184 System.setOut(err);
185 System.setOut(out);
186 if (!saveInstanceFiles) {
187 getFile(".ins").delete();
188 }
189 } catch (OutOfMemoryError e) {
190 throw new LiblinearException("Out of memory. Please increase the Java heap size (-Xmx<size>). ", e);
191 } catch (IllegalArgumentException e) {
192 throw new LiblinearException("The Liblinear learner was not able to redirect Standard Error stream. ", e);
193 } catch (SecurityException e) {
194 throw new LiblinearException("The Liblinear learner cannot remove the instance file. ", e);
195 } catch (IOException e) {
196 throw new LiblinearException("The Liblinear learner cannot save the model file '"+getFile(".mod").getAbsolutePath()+"'. ", e);
197 }
198 } else {
199 trainExternal(featureVector);
200 }
201 saveCardinalities(getInstanceOutputStreamWriter(".car"), cardinalities);
202 }
203
204 @Override
205 public double crossValidate(FeatureVector featureVector, int nrOfSplits)
206 throws MaltChainedException {
207 if (featureVector == null) {
208 throw new LiblinearException("The feature vector cannot be found. ");
209 } else if (owner == null) {
210 throw new LiblinearException("The parent guide model cannot be found. ");
211 }
212
213 cardinalities = getCardinalities(featureVector);
214
215 double crossValidationAccuracy = 0.0;
216
217 //if (pathExternalLiblinearTrain == null) {
218 try {
219 final Problem problem = readLibLinearProblem(getInstanceInputStreamReader(".ins"), cardinalities);
220 if (owner.getGuide().getConfiguration().getConfigLogger().isInfoEnabled()) {
221 owner.getGuide().getConfiguration().getConfigLogger().info("Doing cross validation for model "+ owner.getModelName() + "\n");
222 }
223 final PrintStream out = System.out;
224 final PrintStream err = System.err;
225 System.setOut(NoPrintStream.NO_PRINTSTREAM);
226 System.setErr(NoPrintStream.NO_PRINTSTREAM);
227
228 int[] target = new int[problem.l];
229
230 Linear.crossValidation(problem, getLiblinearParameters(), nrOfSplits, target);
231
232 double totalCorrect = 0;
233 for (int i = 0; i < problem.l; i++)
234 if (target[i] == problem.y[i]) ++totalCorrect;
235
236 if(totalCorrect>0)
237 crossValidationAccuracy = 100.0 * totalCorrect / problem.l;
238
239 System.setOut(err);
240 System.setOut(out);
241 //Don't delete the instance file here
242 //if (!saveInstanceFiles) {
243 // getFile(".ins").delete();
244 //}
245 } catch (OutOfMemoryError e) {
246 throw new LiblinearException("Out of memory. Please increase the Java heap size (-Xmx<size>). ", e);
247 } catch (IllegalArgumentException e) {
248 throw new LiblinearException("The Liblinear learner was not able to redirect Standard Error stream. ", e);
249 } catch (SecurityException e) {
250 throw new LiblinearException("The Liblinear learner cannot remove the instance file. ", e);
251 }
252 //} else {
253 // trainExternal(featureVector);
254 //}
255
256 return crossValidationAccuracy;
257 }
258
259 private void trainExternal(FeatureVector featureVector) throws MaltChainedException {
260 try {
261 maltSVMFormat2OriginalSVMFormat(getInstanceInputStreamReader(".ins"), getInstanceOutputStreamWriter(".ins.tmp"), cardinalities);
262 owner.getGuide().getConfiguration().getConfigLogger().info("Creating Liblinear model (external) "+getFile(".mod").getName());
263
264 final String[] params = getLibLinearParamStringArray();
265 String[] arrayCommands = new String[params.length+3];
266 int i = 0;
267 arrayCommands[i++] = pathExternalLiblinearTrain;
268 for (; i <= params.length; i++) {
269 arrayCommands[i] = params[i-1];
270 }
271 arrayCommands[i++] = getFile(".ins.tmp").getAbsolutePath();
272 arrayCommands[i++] = getFile(".mod").getAbsolutePath();
273
274 if (verbosity == Verbostity.ALL) {
275 owner.getGuide().getConfiguration().getConfigLogger().info('\n');
276 }
277 final Process child = Runtime.getRuntime().exec(arrayCommands);
278 final InputStream in = child.getInputStream();
279 final InputStream err = child.getErrorStream();
280 int c;
281 while ((c = in.read()) != -1){
282 if (verbosity == Verbostity.ALL) {
283 owner.getGuide().getConfiguration().getConfigLogger().info((char)c);
284 }
285 }
286 while ((c = err.read()) != -1){
287 if (verbosity == Verbostity.ALL || verbosity == Verbostity.ERROR) {
288 owner.getGuide().getConfiguration().getConfigLogger().info((char)c);
289 }
290 }
291 if (child.waitFor() != 0) {
292 owner.getGuide().getConfiguration().getConfigLogger().info(" FAILED ("+child.exitValue()+")");
293 }
294 in.close();
295 err.close();
296 if (!saveInstanceFiles) {
297 getFile(".ins").delete();
298 getFile(".ins.tmp").delete();
299 }
300 owner.getGuide().getConfiguration().getConfigLogger().info('\n');
301 } catch (InterruptedException e) {
302 throw new LiblinearException("Liblinear is interrupted. ", e);
303 } catch (IllegalArgumentException e) {
304 throw new LiblinearException("The Liblinear learner was not able to redirect Standard Error stream. ", e);
305 } catch (SecurityException e) {
306 throw new LiblinearException("The Liblinear learner cannot remove the instance file. ", e);
307 } catch (IOException e) {
308 throw new LiblinearException("The Liblinear learner cannot save the model file '"+getFile(".mod").getAbsolutePath()+"'. ", e);
309 } catch (OutOfMemoryError e) {
310 throw new LiblinearException("Out of memory. Please increase the Java heap size (-Xmx<size>). ", e);
311 }
312 }
313
314 private int[] getCardinalities(FeatureVector featureVector) {
315 int[] cardinalities = new int[featureVector.size()];
316 int i = 0;
317 for (FeatureFunction feature : featureVector) {
318 cardinalities[i++] = feature.getFeatureValue().getCardinality();
319 }
320 return cardinalities;
321 }
322
323 private void saveCardinalities(OutputStreamWriter osw, int[] cardinalities) throws MaltChainedException {
324 final BufferedWriter out = new BufferedWriter(osw);
325 try {
326 for (int i = 0, n = cardinalities.length; i < n; i++) {
327 out.write(Integer.toString(cardinalities[i]));
328 if (i < n - 1) {
329 out.write(',');
330 }
331 }
332 out.write('\n');
333 out.close();
334 } catch (IOException e) {
335 throw new LiblinearException("", e);
336 }
337 }
338
339 private int[] loadCardinalities(InputStreamReader isr) throws MaltChainedException {
340 int[] cardinalities = null;
341 try {
342 final BufferedReader in = new BufferedReader(isr);
343 String line;
344 if ((line = in.readLine()) != null) {
345 String[] items = line.split(",");
346 cardinalities = new int[items.length];
347 for (int i = 0; i < items.length; i++) {
348 cardinalities[i] = Integer.parseInt(items[i]);
349 }
350 }
351 in.close();
352 } catch (IOException e) {
353 throw new LiblinearException("", e);
354 } catch (NumberFormatException e) {
355 throw new LiblinearException("", e);
356 }
357 return cardinalities;
358 }
359
360 /* (non-Javadoc)
361 * @see org.maltparser.ml.LearningMethod#moveAllInstances(org.maltparser.ml.LearningMethod, org.maltparser.core.feature.function.FeatureFunction, java.util.ArrayList)
362 */
363 public void moveAllInstances(LearningMethod method, FeatureFunction divideFeature, ArrayList<Integer> divideFeatureIndexVector) throws MaltChainedException {
364 if (method == null) {
365 throw new LiblinearException("The learning method cannot be found. ");
366 } else if (divideFeature == null) {
367 throw new LiblinearException("The divide feature cannot be found. ");
368 }
369
370 try {
371 final BufferedReader in = new BufferedReader(getInstanceInputStreamReader(".ins"));
372 final BufferedWriter out = method.getInstanceWriter();
373 final StringBuilder sb = new StringBuilder(6);
374 int l = in.read();
375 char c;
376 int j = 0;
377
378 while(true) {
379 if (l == -1) {
380 sb.setLength(0);
381 break;
382 }
383 c = (char)l;
384 l = in.read();
385 if (c == '\t') {
386 if (divideFeatureIndexVector.contains(j-1)) {
387 out.write(Integer.toString(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()));
388 out.write('\t');
389 }
390 out.write(sb.toString());
391 j++;
392 out.write('\t');
393 sb.setLength(0);
394 } else if (c == '\n') {
395 out.write(sb.toString());
396 if (divideFeatureIndexVector.contains(j-1)) {
397 out.write('\t');
398 out.write(Integer.toString(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()));
399 }
400 out.write('\n');
401 sb.setLength(0);
402 method.increaseNumberOfInstances();
403 this.decreaseNumberOfInstances();
404 j = 0;
405 } else {
406 sb.append(c);
407 }
408 }
409 in.close();
410 getFile(".ins").delete();
411 out.flush();
412 } catch (SecurityException e) {
413 throw new LiblinearException("The Liblinear learner cannot remove the instance file. ", e);
414 } catch (NullPointerException e) {
415 throw new LiblinearException("The instance file cannot be found. ", e);
416 } catch (FileNotFoundException e) {
417 throw new LiblinearException("The instance file cannot be found. ", e);
418 } catch (IOException e) {
419 throw new LiblinearException("The Liblinear learner read from the instance file. ", e);
420 }
421
422 }
423
424 /* (non-Javadoc)
425 * @see org.maltparser.ml.LearningMethod#predict(org.maltparser.parser.guide.feature.FeatureVector, org.maltparser.ml.KBestList)
426 */
427 public boolean predict(FeatureVector featureVector, SingleDecision decision) throws MaltChainedException {
428
429 if (model == null) {
430 try {
431 model = Linear.loadModel(new BufferedReader(getInstanceInputStreamReaderFromConfigFile(".mod")));
432 } catch (IOException e) {
433 throw new LiblinearException("The model cannot be loaded. ", e);
434 }
435 }
436
437 if (cardinalities == null) {
438 if (getConfigFileEntry(".car") != null) {
439 cardinalities = loadCardinalities(getInstanceInputStreamReaderFromConfigFile(".car"));
440 } else {
441 cardinalities = getCardinalities(featureVector);
442 }
443 }
444 //System.out.println("METHOD PREDICT CARDINALITIES SIZE" + cardinalities.length + " FEATURE VECTOR SIZE " +featureVector.size());
445 if (xlist == null) {
446 xlist = new ArrayList<FeatureNode>(featureVector.size());
447 }
448 if (model == null) {
449 throw new LiblinearException("The Liblinear learner cannot predict the next class, because the learning model cannot be found. ");
450 } else if (featureVector == null) {
451 throw new LiblinearException("The Liblinear learner cannot predict the next class, because the feature vector cannot be found. ");
452 }
453 int j = 0;
454 int offset = 1;
455 int i = 0;
456 for (FeatureFunction feature : featureVector) {
457 final FeatureValue featureValue = feature.getFeatureValue();
458 if (!(excludeNullValues == true && featureValue.isNullValue())) {
459 if (featureValue instanceof SingleFeatureValue) {
460 if (((SingleFeatureValue)featureValue).getCode() < cardinalities[i]) {
461 xlist.add(j++, new FeatureNode(((SingleFeatureValue)featureValue).getCode() + offset, 1));
462 }
463 } else if (featureValue instanceof MultipleFeatureValue) {
464 for (Integer value : ((MultipleFeatureValue)featureValue).getCodes()) {
465 if (value < cardinalities[i]) {
466 xlist.add(j++, new FeatureNode(value + offset, 1));
467 }
468 }
469 }
470 }
471 offset += cardinalities[i];
472 i++;
473 }
474
475 FeatureNode[] xarray = new FeatureNode[j];
476 for (int k = 0; k < j; k++) {
477 xarray[k] = xlist.get(k);
478 }
479
480 if (decision.getKBestList().getK() == 1) {
481 decision.getKBestList().add(Linear.predict(model, xarray));
482 } else {
483 liblinear_predict_with_kbestlist(model, xarray, decision.getKBestList());
484 }
485
486 xlist.clear();
487
488 return true;
489 }
490
491
492 public void terminate() throws MaltChainedException {
493 closeInstanceWriter();
494 model = null;
495 xlist = null;
496 owner = null;
497 }
498
499 public BufferedWriter getInstanceWriter() {
500 return instanceOutput;
501 }
502
503 protected void closeInstanceWriter() throws MaltChainedException {
504 try {
505 if (instanceOutput != null) {
506 instanceOutput.flush();
507 instanceOutput.close();
508 instanceOutput = null;
509 }
510 } catch (IOException e) {
511 throw new LiblinearException("The Liblinear learner cannot close the instance file. ", e);
512 }
513 }
514
515
516 /**
517 * Returns the parameter string for used for configure Liblinear
518 *
519 * @return the parameter string for used for configure Liblinear
520 */
521 public String getParamString() {
522 return paramString;
523 }
524
525 public InstanceModel getOwner() {
526 return owner;
527 }
528
529 protected void setOwner(InstanceModel owner) {
530 this.owner = owner;
531 }
532
533 public int getLearnerMode() {
534 return learnerMode;
535 }
536
537 public void setLearnerMode(int learnerMode) throws MaltChainedException {
538 this.learnerMode = learnerMode;
539 }
540
541 public String getLearningMethodName() {
542 return name;
543 }
544
545 /**
546 * Returns the current configuration
547 *
548 * @return the current configuration
549 * @throws MaltChainedException
550 */
551 public DependencyParserConfig getConfiguration() throws MaltChainedException {
552 return owner.getGuide().getConfiguration();
553 }
554
555 public int getNumberOfInstances() throws MaltChainedException {
556 if(numberOfInstances!=0)
557 return numberOfInstances;
558 else{
559 //Do a line count of the instance file and return that
560
561 BufferedReader reader = new BufferedReader( getInstanceInputStreamReader(".ins"));
562 try {
563 while(reader.readLine()!=null){
564 numberOfInstances++;
565 owner.increaseFrequency();
566 }
567
568 reader.close();
569 } catch (IOException e) {
570 throw new MaltChainedException("No instances found in file",e);
571 }
572
573
574
575 return numberOfInstances;
576
577 }
578 }
579
580 public void increaseNumberOfInstances() {
581 numberOfInstances++;
582 owner.increaseFrequency();
583 }
584
585 public void decreaseNumberOfInstances() {
586 numberOfInstances--;
587 owner.decreaseFrequency();
588 }
589
590 protected void setNumberOfInstances(int numberOfInstances) {
591 this.numberOfInstances = 0;
592 }
593
594 protected void setLearningMethodName(String name) {
595 this.name = name;
596 }
597
598 protected OutputStreamWriter getInstanceOutputStreamWriter(String suffix) throws MaltChainedException {
599 return getConfiguration().getConfigurationDir().getAppendOutputStreamWriter(owner.getModelName()+getLearningMethodName()+suffix);
600 }
601
602 protected InputStreamReader getInstanceInputStreamReader(String suffix) throws MaltChainedException {
603 return getConfiguration().getConfigurationDir().getInputStreamReader(owner.getModelName()+getLearningMethodName()+suffix);
604 }
605
606 protected InputStreamReader getInstanceInputStreamReaderFromConfigFile(String suffix) throws MaltChainedException {
607 return getConfiguration().getConfigurationDir().getInputStreamReaderFromConfigFile(owner.getModelName()+getLearningMethodName()+suffix);
608 }
609
610 protected File getFile(String suffix) throws MaltChainedException {
611 return getConfiguration().getConfigurationDir().getFile(owner.getModelName()+getLearningMethodName()+suffix);
612 }
613
614 protected JarEntry getConfigFileEntry(String suffix) throws MaltChainedException {
615 return getConfiguration().getConfigurationDir().getConfigFileEntry(owner.getModelName()+getLearningMethodName()+suffix);
616 }
617 /**
618 * Reads an instance file into a svm_problem object according to the Malt-SVM format, which is column fixed format (tab-separated).
619 *
620 * @param isr the instance stream reader for the instance file
621 * @param cardinalities a array containing the number of distinct values for a particular column.
622 * @throws LiblinearException
623 */
624 public Problem readLibLinearProblem(InputStreamReader isr, int[] cardinalities) throws MaltChainedException {
625 Problem problem = new Problem();
626
627
628
629 try {
630 final BufferedReader fp = new BufferedReader(isr);
631 int max_index = 0;
632 if (xlist == null) {
633 xlist = new ArrayList<FeatureNode>();
634 }
635 problem.bias = getBias();
636 problem.l = getNumberOfInstances();
637 problem.x = new FeatureNode[problem.l][];
638 problem.y = new int[problem.l];
639 int i = 0;
640 final Pattern tabPattern = Pattern.compile("\t");
641 final Pattern pipePattern = Pattern.compile("\\|");
642 while(true) {
643 String line = fp.readLine();
644
645 if(line == null) break;
646 String[] columns = tabPattern.split(line);
647
648 if (columns.length == 0) {
649 continue;
650 }
651
652 int offset = 1;
653 int j = 0;
654 try {
655 problem.y[i] =
656 Integer.parseInt(columns[j]);
657 int p = 0;
658 for(j = 1; j < columns.length; j++) {
659 final String[] items = pipePattern.split(columns[j]);
660 for (int k = 0; k < items.length; k++) {
661 try {
662 if (Integer.parseInt(items[k]) != -1) {
663 xlist.add(p, new FeatureNode(Integer.parseInt(items[k])+offset, 1));
664 p++;
665 }
666 } catch (NumberFormatException e) {
667 throw new LiblinearException("The instance file contain a non-integer value '"+items[k]+"'", e);
668 }
669 }
670 offset += cardinalities[j-1];
671 }
672 problem.x[i] = xlist.subList(0, p).toArray(new FeatureNode[0]);
673 if(columns.length > 1) {
674 max_index = Math.max(max_index, problem.x[i][p-1].index);
675 }
676 i++;
677 xlist.clear();
678 } catch (ArrayIndexOutOfBoundsException e) {
679 throw new LiblinearException("Cannot read from the instance file. ", e);
680 }
681 }
682 fp.close();
683 problem.n = max_index;
684 if ( problem.bias >= 0 ) {
685 problem.n++;
686 }
687 xlist = null;
688 } catch (IOException e) {
689 throw new LiblinearException("Cannot read from the instance file. ", e);
690 }
691 return problem;
692 }
693
694 protected void initSpecialParameters() throws MaltChainedException {
695 if (getConfiguration().getOptionValue("singlemalt", "null_value") != null && getConfiguration().getOptionValue("singlemalt", "null_value").toString().equalsIgnoreCase("none")) {
696 excludeNullValues = true;
697 } else {
698 excludeNullValues = false;
699 }
700 saveInstanceFiles = ((Boolean)getConfiguration().getOptionValue("liblinear", "save_instance_files")).booleanValue();
701
702 if (!getConfiguration().getOptionValue("liblinear", "liblinear_external").toString().equals("")) {
703 try {
704 if (!new File(getConfiguration().getOptionValue("liblinear", "liblinear_external").toString()).exists()) {
705 throw new LiblinearException("The path to the external Liblinear trainer 'svm-train' is wrong.");
706 }
707 if (new File(getConfiguration().getOptionValue("liblinear", "liblinear_external").toString()).isDirectory()) {
708 throw new LiblinearException("The option --liblinear-liblinear_external points to a directory, the path should point at the 'train' file or the 'train.exe' file");
709 }
710 if (!(getConfiguration().getOptionValue("liblinear", "liblinear_external").toString().endsWith("train") || getConfiguration().getOptionValue("liblinear", "liblinear_external").toString().endsWith("train.exe"))) {
711 throw new LiblinearException("The option --liblinear-liblinear_external does not specify the path to 'train' file or the 'train.exe' file. ");
712 }
713 pathExternalLiblinearTrain = getConfiguration().getOptionValue("liblinear", "liblinear_external").toString();
714 } catch (SecurityException e) {
715 throw new LiblinearException("Access denied to the file specified by the option --liblinear-liblinear_external. ", e);
716 }
717 }
718 if (getConfiguration().getOptionValue("liblinear", "verbosity") != null) {
719 verbosity = Verbostity.valueOf(getConfiguration().getOptionValue("liblinear", "verbosity").toString().toUpperCase());
720 }
721 }
722
723 public String getLibLinearOptions() {
724 StringBuilder sb = new StringBuilder();
725 for (String key : liblinearOptions.keySet()) {
726 sb.append('-');
727 sb.append(key);
728 sb.append(' ');
729 sb.append(liblinearOptions.get(key));
730 sb.append(' ');
731 }
732 return sb.toString();
733 }
734
735 public void parseParameters(String paramstring) throws MaltChainedException {
736 if (paramstring == null) {
737 return;
738 }
739 final String[] argv;
740 String allowedFlags = "sceB";
741 try {
742 argv = paramstring.split("[_\\p{Blank}]");
743 } catch (PatternSyntaxException e) {
744 throw new LiblinearException("Could not split the liblinear-parameter string '"+paramstring+"'. ", e);
745 }
746 for (int i=0; i < argv.length-1; i++) {
747 if(argv[i].charAt(0) != '-') {
748 throw new LiblinearException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
749 }
750 if(++i>=argv.length) {
751 throw new LiblinearException("The last argument does not have any value. ");
752 }
753 try {
754 int index = allowedFlags.indexOf(argv[i-1].charAt(1));
755 if (index != -1) {
756 liblinearOptions.put(Character.toString(argv[i-1].charAt(1)), argv[i]);
757 } else {
758 throw new LiblinearException("Unknown liblinear parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
759 }
760 } catch (ArrayIndexOutOfBoundsException e) {
761 throw new LiblinearException("The liblinear parameter '"+argv[i-1]+"' could not convert the string value '"+argv[i]+"' into a correct numeric value. ", e);
762 } catch (NumberFormatException e) {
763 throw new LiblinearException("The liblinear parameter '"+argv[i-1]+"' could not convert the string value '"+argv[i]+"' into a correct numeric value. ", e);
764 } catch (NullPointerException e) {
765 throw new LiblinearException("The liblinear parameter '"+argv[i-1]+"' could not convert the string value '"+argv[i]+"' into a correct numeric value. ", e);
766 }
767 }
768 }
769
770 public double getBias() throws MaltChainedException {
771 try {
772 return Double.valueOf(liblinearOptions.get("B")).doubleValue();
773 } catch (NumberFormatException e) {
774 throw new LiblinearException("The liblinear bias value is not numerical value. ", e);
775 }
776 }
777
778 public Parameter getLiblinearParameters() throws MaltChainedException {
779 Parameter param = new Parameter(SolverType.MCSVM_CS, 0.1, 0.1);
780 String type = liblinearOptions.get("s");
781
782 if (type.equals("0")) {
783 param.setSolverType(SolverType.L2R_LR);
784 } else if (type.equals("1")) {
785 param.setSolverType(SolverType.L2R_L2LOSS_SVC_DUAL);
786 } else if (type.equals("2")) {
787 param.setSolverType(SolverType.L2R_L2LOSS_SVC);
788 } else if (type.equals("3")) {
789 param.setSolverType(SolverType.L2R_L1LOSS_SVC_DUAL);
790 } else if (type.equals("4")) {
791 param.setSolverType(SolverType.MCSVM_CS);
792 } else if (type.equals("5")) {
793 param.setSolverType(SolverType.L1R_L2LOSS_SVC);
794 } else if (type.equals("6")) {
795 param.setSolverType(SolverType.L1R_LR);
796 } else {
797 throw new LiblinearException("The liblinear type (-s) is not an integer value between 0 and 4. ");
798 }
799 try {
800 param.setC(Double.valueOf(liblinearOptions.get("c")).doubleValue());
801 } catch (NumberFormatException e) {
802 throw new LiblinearException("The liblinear cost (-c) value is not numerical value. ", e);
803 }
804 try {
805 param.setEps(Double.valueOf(liblinearOptions.get("e")).doubleValue());
806 } catch (NumberFormatException e) {
807 throw new LiblinearException("The liblinear epsilon (-e) value is not numerical value. ", e);
808 }
809 return param;
810 }
811
812 public void initLiblinearOptions() {
813 liblinearOptions.put("s", "4"); // type = SolverType.L2LOSS_SVM_DUAL (default)
814 liblinearOptions.put("c", "0.1"); // cost = 1 (default)
815 liblinearOptions.put("e", "0.1"); // epsilon = 0.1 (default)
816 liblinearOptions.put("B", "1"); // bias = 1 (default)
817 }
818
819 public String[] getLibLinearParamStringArray() {
820 final ArrayList<String> params = new ArrayList<String>();
821
822 for (String key : liblinearOptions.keySet()) {
823 params.add("-"+key); params.add(liblinearOptions.get(key));
824 }
825 return params.toArray(new String[params.size()]);
826 }
827
828
829 public void liblinear_predict_with_kbestlist(Model model, FeatureNode[] x, KBestList kBestList) throws MaltChainedException {
830 int i;
831 final int nr_class = model.getNrClass();
832 final double[] dec_values = new double[nr_class];
833
834 Linear.predictValues(model, x, dec_values);
835 final int[] labels = model.getLabels();
836 int[] predictionList = new int[nr_class];
837 for(i=0;i<nr_class;i++) {
838 predictionList[i] = labels[i];
839 }
840
841 double tmpDec;
842 int tmpObj;
843 int lagest;
844 for (i=0;i<nr_class-1;i++) {
845 lagest = i;
846 for (int j=i;j<nr_class;j++) {
847 if (dec_values[j] > dec_values[lagest]) {
848 lagest = j;
849 }
850 }
851 tmpDec = dec_values[lagest];
852 dec_values[lagest] = dec_values[i];
853 dec_values[i] = tmpDec;
854 tmpObj = predictionList[lagest];
855 predictionList[lagest] = predictionList[i];
856 predictionList[i] = tmpObj;
857 }
858
859 int k = nr_class-1;
860 if (kBestList.getK() != -1) {
861 k = kBestList.getK() - 1;
862 }
863
864 for (i=0; i<nr_class && k >= 0; i++, k--) {
865 if (kBestList instanceof ScoredKBestList) {
866 ((ScoredKBestList)kBestList).add(predictionList[i], (float)dec_values[i]);
867 } else {
868 kBestList.add(predictionList[i]);
869 }
870
871 }
872 }
873
874 /**
875 * Converts the instance file (Malt's own SVM format) into the Liblinear (SVMLight) format. The input instance file is removed (replaced)
876 * by the instance file in the Liblinear (SVMLight) format. If a column contains -1, the value will be removed in destination file.
877 *
878 * @param isr the input stream reader for the source instance file
879 * @param osw the output stream writer for the destination instance file
880 * @param cardinalities a vector containing the number of distinct values for a particular column
881 * @throws LiblinearException
882 */
883 public static void maltSVMFormat2OriginalSVMFormat(InputStreamReader isr, OutputStreamWriter osw, int[] cardinalities) throws MaltChainedException {
884 try {
885 final BufferedReader in = new BufferedReader(isr);
886 final BufferedWriter out = new BufferedWriter(osw);
887
888 int c;
889 int j = 0;
890 int offset = 1;
891 int code = 0;
892 while(true) {
893 c = in.read();
894 if (c == -1) {
895 break;
896 }
897
898 if (c == '\t' || c == '|') {
899 if (j == 0) {
900 out.write(Integer.toString(code));
901 j++;
902 } else {
903 if (code != -1) {
904 out.write(' ');
905 out.write(Integer.toString(code+offset));
906 out.write(":1");
907 }
908 if (c == '\t') {
909 offset += cardinalities[j-1];
910 j++;
911 }
912 }
913 code = 0;
914 } else if (c == '\n') {
915 j = 0;
916 offset = 1;
917 out.write('\n');
918 code = 0;
919 } else if (c == '-') {
920 code = -1;
921 } else if (code != -1) {
922 if (c > 47 && c < 58) {
923 code = code * 10 + (c-48);
924 } else {
925 throw new LiblinearException("The instance file contain a non-integer value, when converting the Malt SVM format into Liblinear format.");
926 }
927 }
928 }
929 in.close();
930 out.close();
931 } catch (IOException e) {
932 throw new LiblinearException("Cannot read from the instance file, when converting the Malt SVM format into Liblinear format. ", e);
933 }
934 }
935
936 protected void finalize() throws Throwable {
937 try {
938 closeInstanceWriter();
939 } finally {
940 super.finalize();
941 }
942 }
943
944 /* (non-Javadoc)
945 * @see java.lang.Object#toString()
946 */
947 public String toString() {
948 final StringBuffer sb = new StringBuffer();
949 sb.append("\nLiblinear INTERFACE\n");
950 sb.append(" Liblinear version: "+LIBLINEAR_VERSION+"\n");
951 sb.append(" Liblinear string: "+paramString+"\n");
952
953 sb.append(getLibLinearOptions());
954 return sb.toString();
955 }
956
957
958 @Override
959 public void divideByFeatureSet(
960 Set<Integer> featureIdsToCreateSeparateBranchesForSet, ArrayList<Integer> divideFeatureIndexVector, String otherId) throws MaltChainedException {
961
962
963 //Create a hash map that maps every feature id to a writer
964 HashMap<Integer, BufferedWriter> featureIdToWriterMap = new HashMap<Integer, BufferedWriter>();
965
966 for(int element:featureIdsToCreateSeparateBranchesForSet){
967
968
969 BufferedWriter outputWriter = new BufferedWriter(getConfiguration().getConfigurationDir().getOutputStreamWriter(owner.getModelName().replace('.','_') + element + "." + getLearningMethodName()+".ins"));
970 featureIdToWriterMap.put(element, outputWriter);
971
972 }
973
974 BufferedWriter otherOutputWriter = new BufferedWriter(getConfiguration().getConfigurationDir().getOutputStreamWriter(owner.getModelName().replace('.','_') + otherId + "." + getLearningMethodName()+".ins"));
975
976
977 try {
978 final BufferedReader in = new BufferedReader(getInstanceInputStreamReader(".ins"));
979 //every line will be written to a separate file
980 String line = in.readLine();
981 final Pattern tabPattern = Pattern.compile("\t");
982 while(line!=null){
983
984 //Find out which pot the line shall be put in
985 String[] lineArray = tabPattern.split(line);
986
987 int id = new Integer(lineArray[divideFeatureIndexVector.get(0)+1]);
988
989 if(!featureIdToWriterMap.containsKey(id)){
990 otherOutputWriter.write(line + "\n");
991 }else
992 featureIdToWriterMap.get(id).write(getLineToWrite(lineArray,divideFeatureIndexVector.get(0)+1));
993
994 line = in.readLine();
995 }
996
997 otherOutputWriter.close();
998
999 in.close();
1000
1001 for(BufferedWriter writer: featureIdToWriterMap.values())
1002 writer.close();
1003
1004 } catch (SecurityException e) {
1005 throw new LiblinearException("The Liblinear learner cannot remove the instance file. ", e);
1006 } catch (NullPointerException e) {
1007 throw new LiblinearException("The instance file cannot be found. ", e);
1008 } catch (FileNotFoundException e) {
1009 throw new LiblinearException("The instance file cannot be found. ", e);
1010 } catch (IOException e) {
1011 throw new LiblinearException("The Liblinear learner read from the instance file. ", e);
1012 }
1013
1014
1015
1016 }
1017
1018
1019 private String getLineToWrite(String[] lineArray, int excludeIndex) {
1020 StringBuffer buf = new StringBuffer();
1021
1022 for(int n = 0; n < lineArray.length; n++)
1023 if(n != excludeIndex)
1024 buf.append(lineArray[n] + "\t");
1025 buf.append("\n");
1026 return buf.toString();
1027 }
1028
1029
1030 @Override
1031 public Map<Integer, Integer> createFeatureIdToCountMap(
1032 ArrayList<Integer> divideFeatureIndexVector) throws MaltChainedException{
1033
1034 HashMap<Integer, Integer> featureIdToCountMap = new HashMap<Integer, Integer>();
1035
1036 //Go trough the file and count all feature ids in the given column(s)
1037
1038 try {
1039 final BufferedReader in = new BufferedReader(getInstanceInputStreamReader(".ins"));
1040 //every line will be written to a separate file
1041 String line = in.readLine();
1042 final Pattern tabPattern = Pattern.compile("\t");
1043 while(line!=null){
1044
1045 //Find out which pot the line shall be put in
1046 String[] lineArray = tabPattern.split(line);
1047
1048 for(int n = 0; n < divideFeatureIndexVector.size(); n++){
1049 int id = new Integer(lineArray[divideFeatureIndexVector.get(n)+1]);
1050
1051
1052 if (!featureIdToCountMap.containsKey(id)) {
1053
1054 featureIdToCountMap.put(id, 0);
1055
1056 }
1057
1058 int previousCount = featureIdToCountMap.get(id);
1059
1060 featureIdToCountMap.put(id, previousCount + 1);
1061
1062 }
1063
1064 line = in.readLine();
1065 }
1066 in.close();
1067 } catch (SecurityException e) {
1068 throw new LiblinearException("The Libsvm learner cannot remove the instance file. ", e);
1069 } catch (NullPointerException e) {
1070 throw new LiblinearException("The instance file cannot be found. ", e);
1071 } catch (FileNotFoundException e) {
1072 throw new LiblinearException("The instance file cannot be found. ", e);
1073 } catch (IOException e) {
1074 throw new LiblinearException("The Liblinear learner read from the instance file. ", e);
1075 }
1076
1077 return featureIdToCountMap;
1078 }
1079
1080 }