Commit 1bc99e89 authored by Markus Krug's avatar Markus Krug
Browse files

alles kaput!!

parent c3f06cc7
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
/**
 * Feature generator that emits exactly one positional tag for a token, derived from the
 * annotations of a configurable UIMA type found in the token's CAS.
 *
 * <p>The emitted feature has the form {@code <featureIdentifier>=B-<value>},
 * {@code <featureIdentifier>=I-<value>} or {@code <featureIdentifier>=O}, where
 * {@code <value>} is the string value of the configured feature on the matching annotation.
 */
public class EACL_UIMA_FeatureGen extends AFeatureGenerator {

    // Neither read nor written inside this class; kept because it is package-visible.
    int prefixLen;

    /** Name of the UIMA type whose annotations are inspected. */
    String typeS;

    /** Base name of the feature whose string value is appended to the B-/I- prefix. */
    private String feature;

    /**
     * @param typeS   name of the UIMA annotation type to look up in the CAS type system
     * @param feature base name of the feature on that type to read the tag value from
     * @param labelId identifier handed to the {@link AFeatureGenerator} constructor
     */
    public EACL_UIMA_FeatureGen(String typeS, String feature, String labelId) {
        super(labelId);
        this.typeS = typeS;
        this.feature = feature;
    }

    /**
     * Computes the positional tag of {@code token} relative to the annotations of the
     * configured type. Annotations are visited in index order and the first one that
     * matches decides the result.
     *
     * @param token the token annotation to generate the feature for
     * @return a one-element array holding the generated feature string
     * @throws IllegalStateException if the configured type name is unknown to the type
     *                               system of the token's CAS
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        Type type = token.getCAS().getTypeSystem().getType(typeS);
        if (type == null) {
            // getType yields null for an unknown name; fail with a message that names the
            // misconfigured type instead of an opaque error from the index lookup.
            throw new IllegalStateException("UIMA type not found in type system: " + typeS);
        }

        for (AnnotationFS anno : token.getCAS().getAnnotationIndex(type)) {
            String name = anno.getFeatureValueAsString(type.getFeatureByBaseName(feature));

            if (anno.getBegin() == token.getBegin() && anno.getEnd() == token.getEnd()) {
                // Token and annotation cover exactly the same span.
                return new String[] { super.featureIdentifier + "=" + "B-" + name };
            } else if (token.getBegin() > anno.getBegin() && token.getEnd() <= anno.getEnd()) {
                // Token starts after the annotation start and ends within the annotation.
                return new String[] { super.featureIdentifier + "=" + "I-" + name };
            }
            // NOTE(review): a token that starts at the annotation start but ends before the
            // annotation end matches neither branch and falls through to "O" unless a later
            // annotation matches - confirm this is intended.
        }

        // No annotation of the configured type matched this token.
        return new String[] { super.featureIdentifier + "=" + "O" };
    }
}
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import de.uniwue.ls6.datastructure.LabelAlphabet;
/**
 * Feature generator that tags a token with {@code B-<value>} or {@code E-<value>} depending
 * on how it aligns with the annotations of a configurable UIMA type, where {@code <value>}
 * is the string value of the configured feature on the matching annotation.
 *
 * <p>When no annotation matches, the result of {@code LabelAlphabet.getFeatureToId(0)} is
 * returned instead of a tagged feature.
 */
public class EACL_UIMA_FeatureGenB_E extends AFeatureGenerator {

    // Neither read nor written inside this class; kept because it is package-visible.
    int prefixLen;

    /** Name of the UIMA type whose annotations are inspected. */
    String typeS;

    /** Base name of the feature whose string value is appended to the B-/E- prefix. */
    private String feature;

    /**
     * @param typeS   name of the UIMA annotation type to look up in the CAS type system
     * @param feature base name of the feature on that type to read the tag value from
     * @param labelId identifier handed to the {@link AFeatureGenerator} constructor
     */
    public EACL_UIMA_FeatureGenB_E(String typeS, String feature, String labelId) {
        super(labelId);
        this.typeS = typeS;
        this.feature = feature;
    }

    /**
     * Computes the tag of {@code token} relative to the annotations of the configured type.
     * Annotations are visited in index order and the first one that matches decides the
     * result.
     *
     * @param token the token annotation to generate the feature for
     * @return a one-element array holding the generated feature string
     * @throws IllegalStateException if the configured type name is unknown to the type
     *                               system of the token's CAS
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        Type type = token.getCAS().getTypeSystem().getType(typeS);
        if (type == null) {
            // getType yields null for an unknown name; fail with a message that names the
            // misconfigured type instead of an opaque error from the index lookup.
            throw new IllegalStateException("UIMA type not found in type system: " + typeS);
        }

        for (AnnotationFS anno : token.getCAS().getAnnotationIndex(type)) {
            String name = anno.getFeatureValueAsString(type.getFeatureByBaseName(feature));

            if (anno.getBegin() == token.getBegin() && anno.getEnd() == token.getEnd()) {
                // Token and annotation cover exactly the same span.
                return new String[] { super.featureIdentifier + "=" + "B-" + name };
            } else if (token.getEnd() == anno.getEnd()) {
                // Only the end offsets coincide; the token's begin offset is not checked.
                return new String[] { super.featureIdentifier + "=" + "E-" + name };
            }
        }

        // No annotation matched: fall back to the alphabet entry with id 0.
        return new String[] { LabelAlphabet.getFeatureToId(0) };
    }
}
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.uima.cas.text.AnnotationFS;
/**
 * Feature generator that describes the character-class "shape" of a token.
 *
 * <p>The covered text is first normalised (ASCII lower-case letters become {@code x},
 * ASCII upper-case letters become {@code X}, ASCII digits become {@code D}, everything else
 * is kept), and then every contiguous substring of the normalised text is emitted as one
 * feature, prefixed with the feature identifier.
 */
public class WordCategorization extends AFeatureGenerator {

    public WordCategorization() {
        super("WordBuilding");
    }

    /**
     * Emits one feature per contiguous substring of the normalised token text.
     *
     * <p>A substring that does not start at the first character is prefixed with {@code *};
     * one that does not reach the last character is suffixed with {@code *}.
     *
     * @param token the token annotation whose covered text is analysed
     * @return all generated features; empty if the covered text is empty
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        String text = unifyString(token.getCoveredText());
        int len = text.length();
        List<String> ngrams = new ArrayList<String>();
        for (int beg = 0; beg < len; beg++) {
            for (int end = beg + 1; end <= len; end++) {
                String ngram = text.substring(beg, end);
                if (beg > 0) {
                    // Characters precede this substring inside the token.
                    ngram = "*" + ngram;
                }
                if (end < len) {
                    // Characters follow this substring inside the token.
                    ngram += "*";
                }
                ngrams.add(super.featureIdentifier + "=" + ngram);
            }
        }
        return ngrams.toArray(new String[0]);
    }

    /**
     * Maps each character to its class symbol: {@code a-z} to {@code x}, {@code A-Z} to
     * {@code X}, {@code 0-9} to {@code D}; any other character is copied unchanged.
     *
     * <p>Plain range comparisons are used instead of a per-character regular-expression
     * match; the ranges are exactly those of the former patterns {@code [a-z]},
     * {@code [A-Z]} and {@code [0-9]}.
     */
    private String unifyString(String coveredText) {
        StringBuilder refined = new StringBuilder(coveredText.length());
        for (int i = 0; i < coveredText.length(); i++) {
            char c = coveredText.charAt(i);
            if (c >= 'a' && c <= 'z') {
                refined.append('x');
            } else if (c >= 'A' && c <= 'Z') {
                refined.append('X');
            } else if (c >= '0' && c <= '9') {
                refined.append('D');
            } else {
                refined.append(c);
            }
        }
        return refined.toString();
    }
}
This diff is collapsed.
......@@ -217,7 +217,7 @@ public class BinaryRepresentationRuleLearningAlgorithm implements IRepresentatio
// assert that the maximum is growing
assert (maximumScore <= iterationMatrix.getMaximumScore()) : "Maximum decreased within iteration!";
if (!betterRuleCanBeLearned(mappings.get(mappings.size() - 2), mappings.get(mappings.size() - 1))) {
if (!betterRuleCanBeLearned(maximumScore,iterationMatrix)) {
mappings.remove(mappingForMaximum);
break;
}
......@@ -242,7 +242,13 @@ public class BinaryRepresentationRuleLearningAlgorithm implements IRepresentatio
maximumScore, (maxEntryLocation.getTp() / (maxEntryLocation.getFp() + maxEntryLocation.getTp())));
}
/**
 * Convenience overload that compares the two most recent mappings in the list.
 *
 * @param mappings the mappings collected so far
 * @return {@code true} if fewer than two mappings exist; otherwise the result of comparing
 *         the second-to-last mapping with the last one
 */
private boolean betterRuleCanBeLearned(List<MatrixMapping> mappings) {
    final int count = mappings.size();
    if (count >= 2) {
        MatrixMapping previous = mappings.get(count - 2);
        MatrixMapping latest = mappings.get(count - 1);
        return betterRuleCanBeLearned(previous, latest);
    }
    // With fewer than two mappings there is nothing to compare against.
    return true;
}
private boolean betterRuleCanBeLearned(MatrixMapping lastMapping, MatrixMapping newMapping) {
Set<Set<Point>> lastFeatures = new HashSet<Set<Point>>(lastMapping.getDenseIndexToFeaturesMapping().values());
Set<Set<Point>> newFeatures = new HashSet<Set<Point>>(newMapping.getDenseIndexToFeaturesMapping().values());
......@@ -251,8 +257,9 @@ public class BinaryRepresentationRuleLearningAlgorithm implements IRepresentatio
return false;
}
return true;
}
private boolean betterRuleCanBeLearned(int maximumScore, MatrixMcMatrixFace matrixInFocus) {
if (matrixInFocus.getMaximumScore() > maximumScore) {
......@@ -282,8 +289,8 @@ public class BinaryRepresentationRuleLearningAlgorithm implements IRepresentatio
RepresentationRule rule = pass.apply(instanceToClassify);
if (rule != null) {
predictedLabel = pass.getLabel();
score+=rule.getUniquenessScore();
//score = rule.getPrecision();
//score += rule.getUniquenessScore();
score = rule.getPrecision();
} else {
break;
}
......
......@@ -84,7 +84,10 @@ public class MultiClassRepresentationRuleAlgorithm implements IRepresentationRul
Set<Integer> labelSet = new HashSet<>();
for (Instance i : instances) {
labelSet.add(i.getLabel());
// never create a classifier for the DEFAULT label
if (i.getLabel() != 0) {
labelSet.add(i.getLabel());
}
instanceToLabelMapping.put(i, i.getLabel());
}
return labelSet;
......@@ -105,7 +108,7 @@ public class MultiClassRepresentationRuleAlgorithm implements IRepresentationRul
double maxScore = 0;
int label = 0;
for (ALabelling labelling : binaryLabellings) {
//we sum only non default classifications!
// we sum only non default classifications!
if (labelling.getLabel() == 0)
continue;
fullScore += labelling.getScore();
......
package test;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import de.uniwue.ls6.datastructure.ALabelling;
import de.uniwue.ls6.datastructure.Instance;
import de.uniwue.ls6.datastructure.SimpleLabelling;
import de.uniwue.ls6.rulelearning.algorithm.impl.MultiClassRepresentationRuleAlgorithm;
import de.uniwue.ls6.rulelearning.evaluation.eval.LabelAccuracyEvaluation;
import de.uniwue.ls6.rulelearning.evaluation.fold.FoldUtil;
import de.uniwue.ls6.rulelearning.evaluation.fold.UnstructuredFold;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.BIO_UIMA_FeatureGen;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.EACL_UIMA_FeatureGen;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.EACL_UIMA_FeatureGenB_E;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.IsUppercaseFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.NGramGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.POSTagFeatureGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.PrefixNGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.SuffixNGenerator;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.WordCategorization;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.WordFeaturegenerator;
import de.uniwue.ls6.rulelearning.instanceloading.io.InstanceCreationFactory;
/**
 * Experiment driver: loads all {@code .xmi} files from a hard-coded corpus folder into
 * instances, splits them into folds, trains a {@link MultiClassRepresentationRuleAlgorithm}
 * on the training set of the first fold and prints the label accuracy evaluation for that
 * fold's test set.
 */
public class FirstTestEACL {

    /**
     * Runs the experiment. Command-line arguments are ignored; corpus folder and type system
     * location are hard-coded below.
     *
     * @param args unused
     * @throws IllegalStateException if the corpus folder cannot be listed
     * @throws Exception             propagated from type system loading, instance creation,
     *                               learning or evaluation
     */
    public static void main(String[] args) throws Exception {
        File korpusFOlder = new File("C:\\Users\\mkrug\\owncloud_neu\\paper_RegelLernen\\xmi_gold_preprocessed");
        String uriDkPro = new File("C:\\marian_eclipse\\workspace_mars\\dkprocoreExample\\TypeSystemDK.xml").toPath()
                .toUri().toString();
        TypeSystemDescription tsd = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath(uriDkPro);

        MultiClassRepresentationRuleAlgorithm algorithm = new MultiClassRepresentationRuleAlgorithm(10);

        // listFiles() returns null when the path is not a directory or an I/O error occurs;
        // report that explicitly instead of failing with a bare NullPointerException.
        File[] corpusFiles = korpusFOlder.listFiles();
        if (corpusFiles == null) {
            throw new IllegalStateException("Corpus folder cannot be listed: " + korpusFOlder);
        }

        List<Instance> instances = new ArrayList<Instance>();
        for (File f : corpusFiles) {
            if (!f.getName().endsWith(".xmi"))
                continue;
            instances.addAll(InstanceCreationFactory.createWindowedInstancesFromUIMA(f, instances.size(), 2, 2,
                    "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", tsd,
                    new EACL_UIMA_FeatureGenB_E("de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme",
                            "morphTag", "Entity"),
                    new WordFeaturegenerator(), new SuffixNGenerator(4), new SuffixNGenerator(3),
                    new SuffixNGenerator(2), new SuffixNGenerator(1), new PrefixNGenerator(1),
                    new IsUppercaseFeatureGenerator(), new NGramGenerator(), new WordCategorization(),
                    new BIO_UIMA_FeatureGen("de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk", "Chunk"),
                    new EACL_UIMA_FeatureGen("de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", "value", "NE"),
                    new POSTagFeatureGenerator("PosValue")));
            System.out.println("instances: " + instances.size());
            // if(instances.size()>20000)break;
        }

        // Give every instance a consecutive id, starting at 0, in list order.
        int id = 0;
        for (Instance i : instances) {
            i.setId(id);
            id++;
        }

        // create 5 folds
        List<UnstructuredFold> folds = FoldUtil.readInstancesToFold(instances, new Random(13374211), 5);
        for (UnstructuredFold fold : folds) {
            algorithm.learn(fold.getTrainingset().toArray(new Instance[0]));

            // Collect gold and predicted labels for the test part of this fold.
            List<ALabelling> goldLabels = new ArrayList<>();
            List<ALabelling> systemLabels = new ArrayList<>();
            for (Instance i : fold.getTestSet()) {
                goldLabels.add(new SimpleLabelling(i.getLabel(), 0));
                systemLabels.add(algorithm.apply(i));
            }

            // evaluate this fold
            String evaluateToString = new LabelAccuracyEvaluation()
                    .evaluateToString(goldLabels.toArray(new ALabelling[0]), systemLabels.toArray(new ALabelling[0]));
            System.out.println(evaluateToString);

            // Only the first fold is trained and evaluated; the loop is left immediately.
            break;
        }
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment