Commit d95e3b1c authored by mak28ma's avatar mak28ma
Browse files

few further updates

parent 628ae849
......@@ -39,6 +39,8 @@ public class MatrixUtil {
int formerMatrixSize = matrixface.getTpMatrix().numRows() == matrixface.getTpMatrix().numColumns()
? matrixface.getTpMatrix().numColumns() : -1;
int maxCoverage = 0;
MatrixMapping matrixMapping = new MatrixMapping(formerMatrixSize);
// collect all entries that may remain
......@@ -52,6 +54,9 @@ public class MatrixUtil {
for (MatrixEntry entry : matrixface.getTpMatrix()) {
double amountTP = entry.get();
// log the best possible coverage for logging issues
if (amountTP > maxCoverage)
maxCoverage = (int) amountTP;
// save all entries that may be kept by default; any entry that has more TP
// may be kept since it still has potential
if (amountTP >= maximum) {
......@@ -132,6 +137,7 @@ public class MatrixUtil {
// entry.getTp() + "\tFP" + entry.getFp()
// + "\t" + representationRule.toString());
// }
//logger.info("Best possible coverage: " + maxCoverage);
return matrixMapping;
}
......
......@@ -29,7 +29,7 @@ public class EACL_UIMA_FeatureGenB_E extends AFeatureGenerator {
return new String[] { super.featureIdentifier + "=" + "E-"+name };
}
}
return new String[] { LabelAlphabet.getFeatureToId(0) };
return new String[] { super.featureIdentifier + "=" + "O" };
}
}
......@@ -15,7 +15,7 @@ public class NNFeatureGenerator extends AFeatureGenerator{
@Override
public String[] generateFeatures(AnnotationFS token) {
String label = "NE";
String label = "N";
String featureValueAsString = token.getFeatureValueAsString(token.getType().getFeatureByBaseName(posTagFeature));
return new String[]{featureValueAsString.startsWith(label)?super.featureIdentifier+"="+label:super.featureIdentifier+"=O"};
}
......
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import java.util.List;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.CasUtil;
/**
 * Feature generator that emits the value of one feature of a UIMA annotation
 * covered by the current token.
 *
 * <p>For each token, the annotations of the configured type that are covered by
 * the token are selected. If exactly one such annotation exists, a single
 * feature string {@code <featureIdentifier>=<feature value>} is produced;
 * otherwise no feature is produced.
 */
public class UIMATypeAndFeatureGenerator extends AFeatureGenerator {

    /** Name of the UIMA type, as passed to {@code TypeSystem.getType}. */
    private final String typeS;

    /** Base name of the feature whose value is emitted. */
    private final String featureS;

    /**
     * @param typeS    name of the UIMA type to look up in the CAS type system
     * @param featureS base name of the feature (of that type) to read
     */
    public UIMATypeAndFeatureGenerator(String typeS, String featureS) {
        super("UIMA_T_AND_F_" + typeS + "_" + featureS);
        this.typeS = typeS;
        this.featureS = featureS;
    }

    /**
     * Generates at most one feature for the given token.
     *
     * @param token the token annotation whose covered annotations are inspected
     * @return a one-element array if exactly one annotation of the configured
     *         type is covered by {@code token}, otherwise an empty array
     * @throws IllegalStateException if the configured type or feature does not
     *         exist in the type system of the token's CAS
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        Type type = token.getCAS().getTypeSystem().getType(typeS);
        if (type == null) {
            // Fail with a meaningful message instead of a bare NullPointerException.
            throw new IllegalStateException("UIMA type '" + typeS + "' is not defined in the type system of the CAS");
        }
        Feature feat = type.getFeatureByBaseName(featureS);
        if (feat == null) {
            throw new IllegalStateException("UIMA type '" + typeS + "' has no feature named '" + featureS + "'");
        }

        List<AnnotationFS> selectCovered = CasUtil.selectCovered(type, token);
        if (selectCovered.size() != 1) {
            // Zero or several covered annotations: no unambiguous value to emit.
            return new String[0];
        }
        // NOTE(review): if the feature value is unset this presumably yields "...=null" - confirm
        // whether such a feature string is wanted.
        String value = selectCovered.get(0).getFeatureValueAsString(feat);
        return new String[] { super.featureIdentifier + "=" + value };
    }
}
......@@ -12,6 +12,7 @@ import org.slf4j.LoggerFactory;
import de.uniwue.ls6.datastructure.ALabelling;
import de.uniwue.ls6.datastructure.Instance;
import de.uniwue.ls6.datastructure.LabelAlphabet;
import de.uniwue.ls6.datastructure.SimpleLabelling;
import de.uniwue.ls6.rulelearning.algorithm.IRepresentationRuleLearningAlgorithm;
......@@ -21,10 +22,13 @@ public class MultiClassRepresentationRuleAlgorithm implements IRepresentationRul
List<BinaryRepresentationRuleLearningAlgorithm> binaryClassifiers;
private int beamSize;
private List<Integer> skipClassifierTrainingForLabel;
private Map<Instance, Integer> instanceToLabelMapping;
/**
 * Creates a multi-class learner with an initially empty list of labels to skip.
 *
 * @param beamSize beam size that is handed to every binary classifier this
 *                 algorithm creates
 */
public MultiClassRepresentationRuleAlgorithm(int beamSize) {
    // No label is excluded from training until skipTrainingForLabel is called.
    skipClassifierTrainingForLabel = new ArrayList<Integer>();
    this.beamSize = beamSize;
}
public void learn(Instance... instances) {
......@@ -38,9 +42,11 @@ public class MultiClassRepresentationRuleAlgorithm implements IRepresentationRul
binaryClassifiers = new ArrayList<>();
for (Integer goldLabel : amountDistinctLabels) {
BinaryRepresentationRuleLearningAlgorithm binaryClassifier = new BinaryRepresentationRuleLearningAlgorithm(
goldLabel, 0, beamSize);
binaryClassifiers.add(binaryClassifier);
if (!skipClassifierTrainingForLabel.contains(goldLabel)) {
BinaryRepresentationRuleLearningAlgorithm binaryClassifier = new BinaryRepresentationRuleLearningAlgorithm(
goldLabel, 0, beamSize);
binaryClassifiers.add(binaryClassifier);
}
}
// train one after another
......@@ -123,4 +129,9 @@ public class MultiClassRepresentationRuleAlgorithm implements IRepresentationRul
return new SimpleLabelling(label, (maxScore / fullScore));
}
/**
 * Registers a label for which no binary classifier should be trained.
 *
 * <p>The label string is translated to its id via
 * {@code LabelAlphabet.getIdToFeature} and the id is appended to the skip list.
 *
 * @param string the label in its string form (e.g. {@code "Entity=O"})
 */
public void skipTrainingForLabel(String string) {
    final Integer labelId = LabelAlphabet.getIdToFeature(string);
    skipClassifierTrainingForLabel.add(labelId);
}
}
......@@ -39,13 +39,13 @@ public class FirstTest {
TypeSystemDescription tsd = TypeSystemDescriptionFactory
.createTypeSystemDescriptionFromPath(typesystem.toURL().toString());
List<Instance> instances = InstanceCreationFactory.createWindowedInstancesFromUIMA(document, 0,2, 2,
List<Instance> instances = InstanceCreationFactory.createWindowedInstancesFromUIMA(document, 0,0, 0,
"de.uniwue.kalimachos.coref.type.POS", tsd, new POSTagFeatureGenerator("POSTag"),
new WordFeaturegenerator(), new IsUppercaseFeatureGenerator(), new PrefixNGenerator(3),
new NGramGenerator(),new WordCategorization());
// split the instances into folds (the fold count is the last argument below)
List<UnstructuredFold> folds = FoldUtil.readInstancesToFold(instances, new Random(13374211), 10);
List<UnstructuredFold> folds = FoldUtil.readInstancesToFold(instances, new Random(13374211), 2);
for (UnstructuredFold fold : folds) {
......
......@@ -5,23 +5,32 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.xml.sax.SAXException;
import de.uniwue.ls6.datastructure.ALabelling;
import de.uniwue.ls6.datastructure.Instance;
import de.uniwue.ls6.datastructure.LabelAlphabet;
import de.uniwue.ls6.datastructure.SimpleLabelling;
import de.uniwue.ls6.rulelearning.algorithm.impl.BinaryRepresentationRuleLearningAlgorithm;
import de.uniwue.ls6.rulelearning.algorithm.impl.MultiClassRepresentationRuleAlgorithm;
import de.uniwue.ls6.rulelearning.evaluation.eval.LabelAccuracyEvaluation;
import de.uniwue.ls6.rulelearning.evaluation.fold.FoldUtil;
import de.uniwue.ls6.rulelearning.evaluation.fold.UnstructuredFold;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.BIO_UIMA_FeatureGen;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.IsUppercaseFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.LemmaFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.NGramGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.NNFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.POSTagFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.PrefixNGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.SuffixNGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.UIMATypeAndFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.WordCategorization;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.WordFeaturegenerator;
import de.uniwue.ls6.rulelearning.instanceloading.io.InstanceCreationFactory;
......@@ -35,7 +44,7 @@ public class FirstTest2 {
File korpusFOlder = new File("X:\\Neuer Ordner\\output+speech");
MultiClassRepresentationRuleAlgorithm algorithm = new MultiClassRepresentationRuleAlgorithm(1500);
MultiClassRepresentationRuleAlgorithm algorithm = new MultiClassRepresentationRuleAlgorithm(100);
TypeSystemDescription tsd = TypeSystemDescriptionFactory
.createTypeSystemDescriptionFromPath(typesystem.toURL().toString());
......@@ -44,20 +53,49 @@ public class FirstTest2 {
if (!f.getName().endsWith(".xmi"))
continue;
instances.addAll(InstanceCreationFactory.createWindowedInstancesFromUIMA(f, instances.size(), 2, 2,
"de.uniwue.kalimachos.coref.type.POS", tsd, new BIO_UIMA_FeatureGen("de.uniwue.kalimachos.coref.type.NamedEntity", "NE"),
"de.uniwue.kalimachos.coref.type.POS", tsd,
new BIO_UIMA_FeatureGen("de.uniwue.kalimachos.coref.type.NamedEntity", "NE"),
new WordFeaturegenerator(), new SuffixNGenerator(4), new SuffixNGenerator(3),
new SuffixNGenerator(2), new SuffixNGenerator(1), new PrefixNGenerator(1),
new IsUppercaseFeatureGenerator(),new NGramGenerator() ));
new IsUppercaseFeatureGenerator(), new NGramGenerator(), new WordCategorization(),
new LemmaFeatureGenerator("Lemma"), new POSTagFeatureGenerator("POSTag"),
new UIMATypeAndFeatureGenerator("de.uniwue.kalimachos.coref.type.DependencyParse",
"DependencyRelation"),new UIMATypeAndFeatureGenerator("de.uniwue.kalimachos.coref.type.DependencyParse",
"Headname")));
System.out.println("instances: " + instances.size());
// if(instances.size()>20000)break;
if (instances.size() > 10000)
break;
}
int id =0;
for(Instance i : instances){
int id = 0;
for (Instance i : instances) {
i.setId(id);
id++;
}
algorithm.learn(instances.toArray(new Instance[0]));
// create 5 folds
List<UnstructuredFold> folds = FoldUtil.readInstancesToFold(instances, new Random(13374211), 5);
for (UnstructuredFold fold : folds) {
algorithm.learn(fold.getTrainingset().toArray(new Instance[0]));
// evaluate
List<ALabelling> goldLabels = new ArrayList<>();
List<ALabelling> systemLabels = new ArrayList<>();
for (Instance i : fold.getTestSet()) {
goldLabels.add(new SimpleLabelling(i.getLabel(), 0));
systemLabels.add(algorithm.apply(i));
}
// evaluate this fold
String evaluateToString = new LabelAccuracyEvaluation()
.evaluateToString(goldLabels.toArray(new ALabelling[0]), systemLabels.toArray(new ALabelling[0]));
System.out.println(evaluateToString);
break;
}
}
......
package test;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.xml.sax.SAXException;
import de.uniwue.ls6.datastructure.ALabelling;
import de.uniwue.ls6.datastructure.Instance;
import de.uniwue.ls6.datastructure.LabelAlphabet;
import de.uniwue.ls6.datastructure.SimpleLabelling;
import de.uniwue.ls6.rulelearning.algorithm.impl.BinaryRepresentationRuleLearningAlgorithm;
import de.uniwue.ls6.rulelearning.algorithm.impl.MultiClassRepresentationRuleAlgorithm;
import de.uniwue.ls6.rulelearning.evaluation.eval.LabelAccuracyEvaluation;
import de.uniwue.ls6.rulelearning.evaluation.fold.FoldUtil;
import de.uniwue.ls6.rulelearning.evaluation.fold.UnstructuredFold;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.BIO_UIMA_FeatureGen;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.IsUppercaseFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.LemmaFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.NGramGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.NNFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.POSTagFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.PrefixNGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.SuffixNGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.UIMATypeAndFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.WordCategorization;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.WordFeaturegenerator;
import de.uniwue.ls6.rulelearning.instanceloading.io.InstanceCreationFactory;
/**
 * Manual experiment driver: loads windowed instances from the {@code .xmi}
 * files of a corpus folder, trains a {@link MultiClassRepresentationRuleAlgorithm}
 * on the training part of the first of five folds and prints the label accuracy
 * evaluation for that fold's test set.
 */
public class FirstTest3 {

    /**
     * Runs the experiment. Paths to the type system and the corpus folder are
     * hard-coded.
     *
     * @param args unused
     * @throws IllegalStateException if the corpus folder cannot be listed
     * @throws Exception any failure raised while loading the type system or
     *         the documents
     */
    public static void main(String[] args) throws Exception {
        File typesystem = new File("resources\\MiKalliTypesystem.xml");
        File korpusFOlder = new File("X:\\Neuer Ordner\\output+speech");

        MultiClassRepresentationRuleAlgorithm algorithm = new MultiClassRepresentationRuleAlgorithm(100);

        TypeSystemDescription tsd = TypeSystemDescriptionFactory
                .createTypeSystemDescriptionFromPath(typesystem.toURL().toString());

        // File.listFiles() returns null when the path is not a directory or cannot be read;
        // report that clearly instead of failing with a NullPointerException in the loop.
        File[] corpusFiles = korpusFOlder.listFiles();
        if (corpusFiles == null) {
            throw new IllegalStateException(
                    "Corpus folder is missing or not a readable directory: " + korpusFOlder.getAbsolutePath());
        }

        List<Instance> instances = new ArrayList<Instance>();
        for (File f : corpusFiles) {
            if (!f.getName().endsWith(".xmi"))
                continue;

            instances.addAll(InstanceCreationFactory.createWindowedInstancesFromUIMA(f, instances.size(), 2, 2,
                    "de.uniwue.kalimachos.coref.type.POS", tsd,
                    new NNFeatureGenerator("POSTag"),
                    new WordFeaturegenerator(), new SuffixNGenerator(4), new SuffixNGenerator(3),
                    new SuffixNGenerator(2), new SuffixNGenerator(1), new PrefixNGenerator(1),
                    new IsUppercaseFeatureGenerator(), new NGramGenerator(), new WordCategorization(),
                    new LemmaFeatureGenerator("Lemma"),
                    new UIMATypeAndFeatureGenerator("de.uniwue.kalimachos.coref.type.DependencyParse",
                            "DependencyRelation"),
                    new UIMATypeAndFeatureGenerator("de.uniwue.kalimachos.coref.type.DependencyParse",
                            "Headname")));
            System.out.println("instances: " + instances.size());
            // stop loading further documents once enough instances were collected
            if (instances.size() > 50000)
                break;
        }

        // assign consecutive ids in list order
        int id = 0;
        for (Instance i : instances) {
            i.setId(id);
            id++;
        }

        // create 5 folds
        List<UnstructuredFold> folds = FoldUtil.readInstancesToFold(instances, new Random(13374211), 5);

        for (UnstructuredFold fold : folds) {
            algorithm.learn(fold.getTrainingset().toArray(new Instance[0]));

            // evaluate
            List<ALabelling> goldLabels = new ArrayList<>();
            List<ALabelling> systemLabels = new ArrayList<>();
            for (Instance i : fold.getTestSet()) {
                goldLabels.add(new SimpleLabelling(i.getLabel(), 0));
                systemLabels.add(algorithm.apply(i));
            }
            // evaluate this fold
            String evaluateToString = new LabelAccuracyEvaluation()
                    .evaluateToString(goldLabels.toArray(new ALabelling[0]), systemLabels.toArray(new ALabelling[0]));
            System.out.println(evaluateToString);
            // only the first fold is trained and evaluated
            break;
        }
    }
}
......@@ -37,7 +37,7 @@ public class FirstTestBinary {
File korpusFOlder = new File("X:\\Neuer Ordner\\output+speech");
BinaryRepresentationRuleLearningAlgorithm algorithm = new BinaryRepresentationRuleLearningAlgorithm(
LabelAlphabet.getIdToFeature("IsPOSTag=NE"), LabelAlphabet.getIdToFeature("IsPOSTag=O"), 100);
LabelAlphabet.getIdToFeature("IsPOSTag=N"), LabelAlphabet.getIdToFeature("IsPOSTag=O"), 100);
TypeSystemDescription tsd = TypeSystemDescriptionFactory
.createTypeSystemDescriptionFromPath(typesystem.toURL().toString());
......@@ -68,7 +68,7 @@ public class FirstTestBinary {
String evaluateToString = new LabelAccuracyEvaluation()
.evaluateToString(goldLabels.toArray(new ALabelling[0]), systemLabels.toArray(new ALabelling[0]));
System.out.println(evaluateToString);
//break;
break;
}
}
......
......@@ -31,13 +31,14 @@ public class FirstTestEACL {
public static void main(String[] args) throws Exception {
File korpusFOlder = new File("C:\\Users\\mkrug\\owncloud_neu\\paper_RegelLernen\\xmi_gold_preprocessed");
String uriDkPro = new File("C:\\marian_eclipse\\workspace_mars\\dkprocoreExample\\TypeSystemDK.xml").toPath()
File korpusFOlder = new File("X:\\owncloud\\paper_RegelLernen\\xmi_gold_preprocessed");
String uriDkPro = new File("X:\\owncloud\\paper_RegelLernen\\\\TypeSystemDK.xml").toPath()
.toUri().toString();
TypeSystemDescription tsd = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath(uriDkPro);
MultiClassRepresentationRuleAlgorithm algorithm = new MultiClassRepresentationRuleAlgorithm(10);
algorithm.skipTrainingForLabel("Entity=O");
List<Instance> instances = new ArrayList<Instance>();
for (File f : korpusFOlder.listFiles()) {
......@@ -54,7 +55,7 @@ public class FirstTestEACL {
new EACL_UIMA_FeatureGen("de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", "value","NE"),
new POSTagFeatureGenerator("PosValue")));
System.out.println("instances: " + instances.size());
// if(instances.size()>20000)break;
if(instances.size()>5000)break;
}
int id = 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment