package net.optilab.prototypes.cf_ranking.test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.Random;

import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.trees.J48;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;
import weka.core.converters.ArffSaver;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;

/**
 * Step-by-step walk-through of the basic Weka API: declaring attributes,
 * building a dataset in memory, saving/loading it as ARFF, filtering string
 * attributes, training a J48 classifier, persisting the model, evaluating it
 * and classifying a single instance.
 */
public class Weka {
	/** ARFF file the demo dataset is written to and read back from. */
	private static final String DATASET_FILE = "C:\\temp\\weka_test.arff";

	/**
	 * File the trained classifier is serialized to. Kept separate from
	 * {@link #DATASET_FILE} so that saving the model does not overwrite the
	 * dataset that was just written.
	 */
	private static final String MODEL_FILE = "C:\\temp\\weka_test.model";

	/**
	 * Runs the whole demo and prints intermediate results to standard output.
	 *
	 * @param args ignored
	 * @throws Exception if any Weka operation or file access fails
	 */
	public static void main(String[] args) throws Exception {
		//1.ATTRIBUTES
		//numeric
		Attribute attr = new Attribute("my-numeric");
		System.out.println(attr.isNumeric());
		
		//nominal
		FastVector myNomVals = new FastVector();
		for (int i=0; i<10; i++)
			myNomVals.addElement("value_"+i);
		Attribute attr1 = new Attribute("my-nominal", myNomVals);
		System.out.println(attr1.isNominal());
		
		//string
		Attribute attr2 = new Attribute("my-string", (FastVector)null);
		System.out.println(attr2.isString());
		
		//date
		Attribute attr3 = new Attribute("my-date", "dd-MM-yyyy");
		System.out.println(attr3.isDate());
		
		//whole relation can also be an attr
		//Attribute attr4 = new Attribute("my-relation", new Instances(...));
		
		//2.create dataset
		FastVector attrs = new FastVector();
		attrs.addElement(attr);
		attrs.addElement(attr1);
		attrs.addElement(attr2);
		attrs.addElement(attr3);
		Instances dataset = new Instances("my_dataset", attrs, 0);
		
		//3.add instances
		//first instance: attributes looked up by name, values stored as doubles
		double[] attValues = new double[dataset.numAttributes()];
			attValues[0] = 55;
			attValues[1] = dataset.attribute("my-nominal").indexOfValue("value_5");
			attValues[2] = dataset.attribute("my-string").addStringValue("Slavko");
			attValues[3] = dataset.attribute("my-date").parseDate("7-6-1987");
		dataset.add(new Instance(1.0, attValues));
		
		//second instance: attributes looked up by index, first value missing
		attValues = new double[dataset.numAttributes()];
			attValues[0] = Instance.missingValue();
			attValues[1] = dataset.attribute(1).indexOfValue("value_9");
			attValues[2] = dataset.attribute(2).addStringValue("Marinka");
			attValues[3] = dataset.attribute(3).parseDate("23-4-1989");
		dataset.add(new Instance(1.0, attValues));
		
		//third instance: values set through the Attribute objects
		Instance example = new Instance(4);
			example.setValue(attr, 16);
			example.setValue(attr1, "value_7");
			example.setValue(attr2, "Mirko");
			example.setValue(attr3, attr3.parseDate("1-1-1988"));
		dataset.add(example);
	
		//4.output dataset
		System.out.println(dataset);
		
		//5.save dataset
		ArffSaver saver = new ArffSaver();
		saver.setInstances(dataset);
		saver.setFile(new File(DATASET_FILE));
		saver.writeBatch();
		
		//6.read dataset
		ArffLoader loader = new ArffLoader();
		loader.setFile(new File(DATASET_FILE));
		dataset = loader.getDataSet();	
		
		//7.preprocess strings (almost no classifier supports them)
		StringToWordVector filter = new StringToWordVector();
		filter.setInputFormat(dataset);
		dataset = Filter.useFilter(dataset, filter);
		System.out.println(dataset);
		
		//8.build classifier
		dataset.setClassIndex(1);
		Classifier classifier = new J48();
		classifier.buildClassifier(dataset);
		
		//9.save classifier (to its own file, stream closed before it is read back)
		saveClassifier(classifier, MODEL_FILE);
		
		//10. read classifier back
		classifier = loadClassifier(MODEL_FILE);
		
		//11.evaluate
		//NOTE: the classifier was built on the full dataset in step 8, so the
		//test instances below have already been seen during training and the
		//reported figures are optimistic.
		//resample if needed
		dataset = dataset.resample(new Random(42));
		//split to 70:30 learn and test set
		double percent = 70.0;
		int trainSize = (int) Math.round(dataset.numInstances() * percent / 100);
		int testSize = dataset.numInstances() - trainSize;
		Instances train = new Instances(dataset, 0, trainSize);
		Instances test = new Instances(dataset, trainSize, testSize);
		train.setClassIndex(1);
		test.setClassIndex(1);
		//do eval
		Evaluation eval = new Evaluation(train); //trainset
		eval.evaluateModel(classifier, test); //testset
		System.out.println(eval.toSummaryString());
		System.out.println(eval.weightedFMeasure());
		System.out.println(eval.weightedPrecision());
		System.out.println(eval.weightedRecall());
		
		//12.classify
		Instance first = dataset.firstInstance();
		//result (index of the predicted class value)
		double prediction = classifier.classifyInstance(first);
		System.out.println(prediction);
		//classified result value (label that belongs to the predicted index)
		System.out.println(dataset.attribute(dataset.classIndex()).value((int) prediction));
		//class distribution; Arrays.toString prints the numbers instead of the array identity
		System.out.println(Arrays.toString(classifier.distributionForInstance(first)));
	}

	/**
	 * Serializes a classifier to the given file.
	 *
	 * @param classifier the classifier to write
	 * @param path destination file
	 * @throws IOException if the file cannot be opened or written
	 */
	private static void saveClassifier(Classifier classifier, String path) throws IOException {
		OutputStream os = new FileOutputStream(path);
		try {
			ObjectOutputStream objectOutputStream = new ObjectOutputStream(os);
			objectOutputStream.writeObject(classifier);
			//push any buffered bytes to the file before the handle is released
			objectOutputStream.flush();
		} finally {
			os.close();
		}
	}

	/**
	 * Reads back a classifier written by {@link #saveClassifier(Classifier, String)}.
	 *
	 * @param path file holding the serialized classifier
	 * @return the deserialized classifier
	 * @throws IOException if the file cannot be opened or read
	 * @throws ClassNotFoundException if the serialized class is not on the classpath
	 */
	private static Classifier loadClassifier(String path) throws IOException, ClassNotFoundException {
		InputStream is = new FileInputStream(path);
		try {
			ObjectInputStream objectInputStream = new ObjectInputStream(is);
			return (Classifier) objectInputStream.readObject();
		} finally {
			is.close();
		}
	}
}

