Commit 3092cf31 authored by Markus Krug's avatar Markus Krug
Browse files

Lots of changes; before moving the repo

parent 8d923205
......@@ -6,5 +6,18 @@ Bundle-Version: 1.0.0.qualifier
Bundle-ClassPath: .
Require-Bundle: de.uniwue.mk.kall.mlf.features;bundle-version="1.0.0",
de.uniwue.mk.kall.mlf.libraries;bundle-version="1.0.0"
Export-Package: de.uniwue.mk.kall.mlf.classifier.application,
de.uniwue.mk.kall.mlf.classifier.trainer
Export-Package: de.uniwue.mk.kall.mlf.classifier.application;
uses:="cc.mallet.classify,
libsvm,
weka.classifiers,
cc.mallet.fst,
weka.core,
cc.mallet.pipe,
cc.mallet.types",
de.uniwue.mk.kall.mlf.classifier.trainer;
uses:="de.uniwue.mk.kall.mlf.classifier.application,
cc.mallet.classify,
libsvm,
weka.classifiers,
weka.core,
cc.mallet.types"
......@@ -13,4 +13,13 @@ public class CRFApplication {
}
/**
 * Applies a trained CRF model to a test file and returns the per-token
 * confidence values that SimpleTagger accumulates in its static state.
 *
 * @param testFile  feature file to tag
 * @param modelFile serialized CRF model
 * @return confidences collected by SimpleTagger during tagging
 * @throws Exception propagated from SimpleTagger.main
 */
public static Double[] applyForConfidence(File testFile, File modelFile) throws Exception {
    // Run the Mallet SimpleTagger in apply mode ("--train false") on the test file.
    String[] taggerArgs = {
            "--train", "false",
            "--model-file", modelFile.getAbsolutePath(),
            testFile.getAbsolutePath()
    };
    SimpleTagger.main(taggerArgs);
    // SimpleTagger exposes the confidences through a static field; snapshot them.
    return SimpleTagger.weights.toArray(new Double[0]);
}
}
......@@ -139,4 +139,59 @@ public class MaxEntClassifierApplication {
}
/**
 * Classifies every non-empty line of the feature string with the given MaxEnt
 * model and returns the predicted labels, one per line.
 *
 * Each line is expected to be whitespace-separated features with the gold
 * label as the last token; the label is dropped before classification.
 *
 * @param features newline-separated feature lines
 * @param model    trained Mallet MaxEnt classifier
 * @return best label per classified line, in input order
 */
public static String[] apply(String features, MaxEnt model) {
    List<String> resultList = new ArrayList<String>();
    String[] lines = features.split("\n");
    for (String line : lines) {
        if (line.isEmpty()) {
            continue;
        }
        String[] split = line.split(" ");
        // Rebuild the line without its last token (the gold label).
        StringBuilder featureBuilder = new StringBuilder();
        for (int i = 0; i < split.length - 1; i++) {
            featureBuilder.append(split[i]).append(" ");
        }
        // Strip '=' so Mallet's pipe does not treat it as a feature=value
        // separator. replace() is used instead of replaceAll() because the
        // target is a literal, not a regex — avoids recompiling a Pattern per line.
        String instance = featureBuilder.toString().replace("=", "");
        Classification classify = model.classify(instance);
        resultList.add(classify.getLabeling().getBestLabel().toString());
    }
    return resultList.toArray(new String[0]);
}
/**
 * Classifies every non-empty line of the feature string with the given MaxEnt
 * model and returns the confidence (best-label probability) per line.
 *
 * Mirrors {@code apply(String, MaxEnt)} but collects
 * {@code getBestValue()} instead of the label.
 *
 * @param features newline-separated feature lines (last token = gold label, dropped)
 * @param model    trained Mallet MaxEnt classifier
 * @return best-label value per classified line, in input order
 */
public static Double[] applyForConfidence(String features, MaxEnt model) {
    List<Double> resultList = new ArrayList<Double>();
    String[] lines = features.split("\n");
    for (String line : lines) {
        if (line.isEmpty()) {
            continue;
        }
        String[] split = line.split(" ");
        // Rebuild the line without its last token (the gold label).
        StringBuilder featureBuilder = new StringBuilder();
        for (int i = 0; i < split.length - 1; i++) {
            featureBuilder.append(split[i]).append(" ");
        }
        // Literal replacement: replace() instead of replaceAll() — the '='
        // target is not a regex, so no per-line Pattern compilation is needed.
        String instance = featureBuilder.toString().replace("=", "");
        Classification classify = model.classify(instance);
        resultList.add(classify.getLabeling().getBestValue());
    }
    return resultList.toArray(new Double[0]);
}
}
......@@ -15,12 +15,12 @@ public class SVMHMMClassifierApplication {
public static String[] apply(File testFile, File modelFile) throws Exception {
File tempFile = new File(
"C:\\Users\\mkrug\\workspace\\de.uniwue.mk.kall.mlf.classifier\\svmhmm\\tempDataResults");
"\\\\hastur\\scratch\\kallimachos\\kalimachos Doks\\NER journal\\dataset\\features\\tempSVMHMM.txt");
ProcessBuilder probuilder = new ProcessBuilder(
new String[] {
new File(
"C:\\Users\\mkrug\\workspace\\de.uniwue.mk.kall.mlf.classifier\\svmhmm\\svm_hmm_classify.exe")
"C:\\Users\\mkrug\\git\\Kallimachos_MachineLearning\\de.uniwue.mk.kall.mlf.classifier\\src\\de\\uniwue\\mk\\kall\\mlf\\classifier\\trainer\\svm_hmm_classify.exe")
.getAbsolutePath(), testFile.getAbsolutePath(),
modelFile.getAbsolutePath(), tempFile.getAbsolutePath() });
......
package de.uniwue.mk.kall.mlf.classifier.application;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
public class SVMLightClassifierApplication {

    // NOTE(review): machine-specific absolute paths — these should be moved to
    // configuration before the project is shared. Kept verbatim to preserve behavior.
    private static final String TEMP_RESULT_PATH =
            "C:\\Users\\mkrug\\git\\Kallimachos_MachineLearning\\de.uniwue.mk.kall.mlf.classifier\\svmhmm\\tempDataResults";
    private static final String CLASSIFY_EXE_PATH =
            "C:\\Users\\mkrug\\git\\Kallimachos_MachineLearning\\de.uniwue.mk.kall.mlf.classifier\\svmlight\\svm_classify.exe";

    /**
     * Runs the external svm_classify binary and maps every resulting decision
     * value to its sign: "1" for values &gt;= 0, "-1" otherwise.
     *
     * @param testFile  feature file to classify
     * @param modelFile trained SVMlight model
     * @return one "1"/"-1" entry per line of the classifier's result file
     * @throws Exception if the process cannot be started or the result file cannot be read
     */
    public static String[] apply(File testFile, File modelFile) throws Exception {
        List<String> lines = runSvmClassify(testFile, modelFile);
        String[] results = new String[lines.size()];
        for (int i = 0; i < lines.size(); i++) {
            results[i] = Double.parseDouble(lines.get(i).trim()) >= 0.0 ? "1" : "-1";
        }
        return results;
    }

    /**
     * Same invocation as {@link #apply(File, File)} but returns the raw
     * decision values exactly as svm_classify wrote them.
     *
     * @param testFile  feature file to classify
     * @param modelFile trained SVMlight model
     * @return raw result lines of svm_classify
     * @throws Exception if the process cannot be started or the result file cannot be read
     */
    public static String[] applyForFullDistance(File testFile, File modelFile) throws Exception {
        List<String> lines = runSvmClassify(testFile, modelFile);
        return lines.toArray(new String[0]);
    }

    /**
     * Launches svm_classify, drains its stdout, waits for termination and
     * reads the temp result file it produced. Shared by both public methods
     * (they previously duplicated this logic verbatim).
     */
    private static List<String> runSvmClassify(File testFile, File modelFile) throws Exception {
        File tempFile = new File(TEMP_RESULT_PATH);
        ProcessBuilder probuilder = new ProcessBuilder(new String[] {
                new File(CLASSIFY_EXE_PATH).getAbsolutePath(),
                testFile.getAbsolutePath(),
                modelFile.getAbsolutePath(),
                tempFile.getAbsolutePath() });
        Process process = probuilder.start();
        // Drain stdout so the child cannot block on a full pipe; the output
        // itself is intentionally discarded. try-with-resources closes the
        // reader, which the original code leaked.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
            while (br.readLine() != null) {
                // output intentionally ignored
            }
        }
        try {
            process.waitFor();
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of silently swallowing it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        // svm_classify writes one decision value per input line to the temp file.
        return Files.readAllLines(tempFile.toPath(), StandardCharsets.UTF_8);
    }
}
......@@ -515,6 +515,8 @@ public static Sequence[] apply(Transducer model, Sequence input, int k)
public static StringBuilder output = new StringBuilder();
public static List<Double> weights;
public static double probability;
public static void main (String[] args) throws Exception
{
output = new StringBuilder();
......
package de.uniwue.mk.kall.mlf.classifier.application;
import java.io.File;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import cc.mallet.classify.Classification;
import de.uniwue.mk.kall.mlf.featureRepresentation.util.MalletIO;
public class SkipChainCRFApplication {

    // TODO lazy so this is not a skip chain CRF yet — the deserialized model
    // is actually a Mallet DecisionTree, despite the class name.

    /**
     * Applies the serialized model to every non-empty line of the test file
     * and returns the predicted labels in input order.
     *
     * Each line is whitespace-separated features with the gold label as the
     * last token; the label is dropped before classification.
     *
     * @param testFile  feature file, read with the platform default charset
     * @param modelFile serialized model (currently a DecisionTree)
     * @return best label per classified line
     * @throws Exception if deserialization or file reading fails
     */
    public static String[] apply(File testFile, File modelFile) throws Exception {
        MalletIO ioHandler = new MalletIO();
        // read the model from modelFile
        cc.mallet.classify.DecisionTree model = (cc.mallet.classify.DecisionTree) ioHandler
                .deserialize(modelFile);
        List<String> resultList = new ArrayList<String>();
        List<String> lines = Files.readAllLines(testFile.toPath(), Charset.defaultCharset());
        for (String line : lines) {
            if (line.isEmpty()) {
                continue;
            }
            String[] split = line.split(" ");
            // Rebuild the line without its last token (the gold label).
            StringBuilder featureBuilder = new StringBuilder();
            for (int i = 0; i < split.length - 1; i++) {
                featureBuilder.append(split[i]).append(" ");
            }
            // Literal replacement — replace() instead of replaceAll() avoids
            // compiling a regex Pattern for every line. Also removed the
            // unused local 'index' the original declared and incremented nowhere.
            String instance = featureBuilder.toString().replace("=", "");
            Classification classify = model.classify(instance);
            resultList.add(classify.getLabeling().getBestLabel().toString());
        }
        return resultList.toArray(new String[0]);
    }
}
......@@ -17,7 +17,7 @@ public class SVMHMMClassifierTrainer {
new String[] {
new File(
"C:\\Users\\mkrug\\workspace\\de.uniwue.mk.kall.mlf.classifier\\svmhmm\\svm_hmm_learn.exe")
"C:\\Users\\mkrug\\git\\Kallimachos_MachineLearning\\de.uniwue.mk.kall.mlf.classifier\\src\\de\\uniwue\\mk\\kall\\mlf\\classifier\\trainer\\svm_hmm_learn.exe")
.getAbsolutePath(), "-c", String.valueOf(slackVariable), "-e", "0.05",
trainFile.getAbsolutePath(), modelFile.getAbsolutePath() });
......
package de.uniwue.mk.kall.mlf.classifier.trainer;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
public class SVMLightClassifierTrainer {
/*
 * Available options are:
 *
 * General options: -? - this help -v [0..3] - verbosity level (default 1) Learning options: -z
 * {c,r} -> select between classification and regression (default classification) -c float - C:
 * trade-off between training error and margin (default [avg. x*x]^-1) -w [0..] -> epsilon width
 * of tube for regression (default 0.1) -j float - Cost: cost-factor, by which training errors on
 * positive examples outweight errors on negative examples (default 1) (see [4]) -b [0,1] - use
 * biased hyperplane (i.e. x*w+b0) instead of unbiased hyperplane (i.e. x*w0) (default 1) -i [0,1]
 * - remove inconsistent training examples and retrain (default 0) Performance estimation options:
 * -x [0,1] - compute leave-one-out estimates (default 0) (see [5]) -o ]0..2] - value of rho for
 * XiAlpha-estimator and for pruning leave-one-out computation (default 1.0) (see [2]) -k [0..100]
 * - search depth for extended XiAlpha-estimator (default 0) Transduction options (see [3]): -p
 * [0..1] - fraction of unlabeled examples to be classified into the positive class (default is
 * the ratio of positive and negative examples in the training data) Kernel options: -t int - type
 * of kernel function: 0: linear (default) 1: polynomial (s a*b+c)^d 2: radial basis function
 * exp(-gamma ||a-b||^2) 3: sigmoid tanh(s a*b + c) 4: user defined kernel from kernel.h -d int -
 * parameter d in polynomial kernel -g float - parameter gamma in rbf kernel -s float - parameter
 * s in sigmoid/poly kernel -r float - parameter c in sigmoid/poly kernel -u string - parameter of
 * user defined kernel Optimization options (see [1]): -q [2..] - maximum size of QP-subproblems
 * (default 10) -n [2..q] - number of new variables entering the working set in each iteration
 * (default n = q). Set n<q to prevent zig-zagging. -m [5..] - size of cache for kernel
 * evaluations in MB (default 40) The larger the faster... -e float - eps: Allow that error for
 * termination criterion [y [w*x+b] - 1] = eps (default 0.001) -h [5..] - number of iterations a
 * variable needs to be optimal before considered for shrinking (default 100) -f [0,1] - do final
 * optimality check for variables removed by shrinking. Although this test is usually positive,
 * there is no guarantee that the optimum was found if the test is omitted. (default 1) Output
 * options: -l char - file to write predicted labels of unlabeled examples into after transductive
 * learning -a char - write all alphas to this file after learning (in the same order as in the
 * training set)
 */

    /**
     * Trains an SVMlight model by invoking the external svm_learn binary with
     * a fixed soft-margin parameter (-c 0.75) and echoing its output.
     *
     * NOTE(review): the path to svm_learn.exe is machine-specific and should
     * be externalized to configuration.
     *
     * @param trainFile training data in SVMlight format
     * @param modelFile destination the trained model is written to
     * @throws Exception if the external process cannot be started
     */
    public static void train(File trainFile, File modelFile) throws Exception {
        ProcessBuilder probuilder = new ProcessBuilder(new String[] {
                new File(
                        "C:\\Users\\mkrug\\git\\Kallimachos_MachineLearning\\de.uniwue.mk.kall.mlf.classifier\\svmlight\\svm_learn.exe")
                        .getAbsolutePath(), "-c", "0.75",
                trainFile.getAbsolutePath(), modelFile.getAbsolutePath() });
        Process process = probuilder.start();
        // Echo the trainer's stdout; also prevents the child from blocking on a
        // full pipe. try-with-resources closes the reader the original leaked.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
            }
        }
        // Wait to get exit value
        try {
            int exitValue = process.waitFor();
            System.out.println("\n\nExit Value is " + exitValue);
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of silently swallowing it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}
This diff is collapsed.
package de.uniwue.mk.kall.mlf.clusterFeatures;
import java.io.BufferedReader;
......
package de.uniwue.mk.kall.featuregenerator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.text.AnnotationFS;
import de.uniwue.mk.kall.mlf.featureRepresentation.formats.AKallimachosFeatureGenerator;
import de.uniwue.mkrug.kall.typesystemutil.Util_impl;
public abstract class AKallimachosFeaturesToTxtGenerator {

    // Disabled by default: conjoining features roughly squares the feature set size.
    protected boolean useFeatureConjunction = false;

    /**
     * Collects the features produced by every generator for the given token.
     * With conjunction disabled (the default) the concatenated raw feature
     * list is returned; otherwise the (unfinished) conjunction path runs.
     *
     * @param cas        CAS the token belongs to
     * @param util       type-system utility (currently unused here, kept for the subclass API)
     * @param generators feature generators to run
     * @param token      token annotation to generate features for
     * @return raw feature list, or null when the unimplemented conjunction path runs
     */
    protected List<String> performFeatureGeneration(CAS cas, Util_impl util,
            List<AKallimachosFeatureGenerator> generators, AnnotationFS token) {
        List<String> featuresTemp = new ArrayList<String>();
        for (AKallimachosFeatureGenerator gen : generators) {
            featuresTemp.addAll(Arrays.asList(gen.generateFeatures(cas, token)));
        }
        return applyFeatureConjunction(featuresTemp);
    }

    /**
     * Ranking variant of {@link #performFeatureGeneration}: features are
     * generated for a candidate string instead of a token annotation.
     *
     * @param cas        CAS providing context
     * @param util       type-system utility (currently unused here, kept for the subclass API)
     * @param generators feature generators to run
     * @param toRank     candidate string to generate features for
     * @return raw feature list, or null when the unimplemented conjunction path runs
     */
    protected List<String> performFeatureGenerationForRanking(CAS cas, Util_impl util,
            List<AKallimachosFeatureGenerator> generators, String toRank) {
        List<String> featuresTemp = new ArrayList<String>();
        for (AKallimachosFeatureGenerator gen : generators) {
            featuresTemp.addAll(Arrays.asList(gen.generateFeatures(toRank, cas)));
        }
        return applyFeatureConjunction(featuresTemp);
    }

    /**
     * Shared tail of both generation methods (previously duplicated verbatim).
     * Passes the list through when conjunction is disabled.
     *
     * NOTE(review): the conjunction itself is unimplemented (TODO) and this
     * path still returns null — exactly as the original code did. Callers must
     * not enable useFeatureConjunction until the combination step is written.
     */
    private List<String> applyFeatureConjunction(List<String> featuresTemp) {
        if (!useFeatureConjunction) {
            return featuresTemp;
        }
        Set<String> features = new HashSet<String>();
        for (String s : featuresTemp) {
            for (String k : featuresTemp) {
                // equal datapoints are only needed once
                if (s.equals(k)) {
                    features.add(k);
                }
                else {
                    // else we combine them and add them into the hashset as combination
                    // TODO
                }
            }
        }
        return null;
    }
}
package de.uniwue.mk.kall.featuregenerator;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.xml.sax.SAXException;
import de.uniwue.mk.kall.mlf.featureRepresentation.formats.AKallimachosFeatureGenerator;
import de.uniwue.mkrug.kall.typesystemutil.Util_impl;
public class KallimachosEntRankingFeatureGenerator extends AKallimachosFeaturesToTxtGenerator {
// generates features with the ranking feature generator
public void generateFeatures(File toWrite, boolean append,
List<AKallimachosFeatureGenerator> generators, AKallimachosFeatureGenerator goldgen,
File inputFile) throws IOException, SAXException {
FileWriter fw = new FileWriter(toWrite, append);
// for each File
for (File f : inputFile.listFiles()) {
System.out.println("Generate Features for " + f.getName());
CAS cas = Util_impl.createCas();
XmiCasDeserializer.deserialize(new FileInputStream(f), cas);
Util_impl util = new Util_impl(cas);
// create all cluster from the NEs
HashMap<String, List<String>> clusterMaps = generateClusterMaps(cas, util);
// for each sentence... we write one sequence to the datapoints file
for (String s : clusterMaps.keySet()) {
StringBuilder sb = new StringBuilder();
// for each token in that sentence
for (String toRank : clusterMaps.get(s)) {
// 1 line is added for each token
// generate all datapoints for this token
List<String> features = performFeatureGenerationForRanking(cas, util, generators, toRank);
// append all datapoints to sb each separated by a blank " "
for (String feature : features) {
sb.append(feature).append(" ");