Commit 768779d1 authored by mak28ma
Browse files

annotate

parent 22a65036
package test;
import java.awt.Point;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.CasCreationUtils;
import org.xml.sax.SAXException;
import de.uniwue.ls6.datastructure.ALabelling;
import de.uniwue.ls6.datastructure.Instance;
import de.uniwue.ls6.datastructure.SimpleLabelling;
import de.uniwue.ls6.rulelearning.algorithm.impl.MultiClassOneVsAllRepresentationRuleAlgorithm;
import de.uniwue.ls6.rulelearning.evaluation.eval.EEntityEvaluationsScheme;
import de.uniwue.ls6.rulelearning.evaluation.eval.EntityAccuracyEvaluation;
import de.uniwue.ls6.rulelearning.evaluation.eval.LabelAccuracyEvaluation;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.BE_CONNL_NE_UIMA_FeatureGen;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.CMU_LineStartsWithToken;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.CMU_NextTokenLineBreak;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.CMU_PersonRegex;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.CMU_PrevTokenLineBreak;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.CMU_TimeRegex;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.WordCategorizationNoNGram;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.WordFeaturegenerator;
import de.uniwue.ls6.rulelearning.instanceloading.io.InstanceCreationFactory;
/**
 * Stand-alone experiment driver: trains a
 * {@link MultiClassOneVsAllRepresentationRuleAlgorithm} on 80% of the XMI
 * documents found in a corpus folder, prints label- and entity-level
 * evaluation results for the remaining 20%, and finally writes an annotated
 * copy of every test document to an output folder.
 *
 * <p>
 * All paths are hard-coded for the original author's machine; adjust them in
 * {@link #main(String[])} before running.
 */
public class FirstTestEACLBE5 {

    /** UIMA type whose annotations are used as tokens (one instance per annotation). */
    private static final String TOKEN_TYPE_NAME = "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS";

    /** UIMA type that is (re)used to store the entities predicted by the system. */
    private static final String SYSTEM_TYPE_NAME = "de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf";

    /** Feature of {@link #SYSTEM_TYPE_NAME} that receives the predicted entity label. */
    private static final String SYSTEM_FEATURE_NAME = "term";

    /** Fixed seed for shuffling the corpus files before the split. */
    private static final long SHUFFLE_SEED = 13374211;

    /** Fraction of the corpus files used for training; the rest is the test set. */
    private static final double TRAIN_FRACTION = 0.8;

    /**
     * Runs the complete train / evaluate / annotate experiment.
     *
     * @param args
     *            ignored
     * @throws Exception
     *             if the corpus folder cannot be listed, the output folder
     *             cannot be created, or any UIMA / IO step fails
     */
    public static void main(String[] args) throws Exception {
        File korpusFolder = new File("X:\\owncloud\\paper_RegelLernen\\manualPreprocessed");
        File out = new File("X:\\owncloud\\paper_RegelLernen\\annotated");
        String uriDkPro = new File("X:\\owncloud\\paper_RegelLernen\\TypeSystemDK.xml").toPath().toUri()
                .toString();

        // Fail before the (potentially long) training if we could not write the results anyway.
        if (!out.isDirectory() && !out.mkdirs()) {
            throw new IOException("Cannot create output folder: " + out.getAbsolutePath());
        }

        // File.listFiles() returns null (not an empty array) for a missing/unreadable folder.
        File[] listFiles = korpusFolder.listFiles();
        if (listFiles == null) {
            throw new IOException("Cannot list corpus folder: " + korpusFolder.getAbsolutePath());
        }

        TypeSystemDescription tsd = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath(uriDkPro);
        MultiClassOneVsAllRepresentationRuleAlgorithm algorithm = new MultiClassOneVsAllRepresentationRuleAlgorithm(
                200);
        algorithm.skipTrainingForLabel("O-");
        // Drop-out is currently disabled:
        // algorithm.setUseDropOut(new Random(13374211), 0.7);

        // Shuffle with a fixed seed, then split 80/20.
        // NOTE(review): listFiles() gives no ordering guarantee, so the split is
        // only reproducible as long as the file system lists files consistently.
        List<File> shuffled = Arrays.asList(listFiles);
        Collections.shuffle(shuffled, new Random(SHUFFLE_SEED));
        int lenFold = (int) (listFiles.length * TRAIN_FRACTION);
        List<File> trainData = new ArrayList<>(shuffled.subList(0, lenFold));
        List<File> testData = new ArrayList<>(shuffled.subList(lenFold, shuffled.size()));

        List<Instance> instancesTrain = new ArrayList<Instance>();
        List<Instance> instancesTest = new ArrayList<Instance>();
        generateInstances(tsd, instancesTrain, trainData);
        generateInstances(tsd, instancesTest, testData);
        System.out.println("TrainData: " + trainData.size());
        System.out.println("TestData: " + testData.size());

        algorithm.learn(instancesTrain.toArray(new Instance[0]));

        // Label-level evaluation on the held-out files.
        List<ALabelling> goldLabels = goldLabels(instancesTest);
        List<ALabelling> systemLabels = predict(algorithm, instancesTest);
        // NOTE(review): the header below still mentions drop-out ("DO: 70%")
        // although drop-out is commented out above - confirm which is intended.
        System.out.println("Evaluate for: DO: 70% BS: 500 cutoff 0");
        String evaluateToString = new LabelAccuracyEvaluation().evaluateToString(
                goldLabels.toArray(new ALabelling[0]), systemLabels.toArray(new ALabelling[0]));
        System.out.println(evaluateToString);

        // Entity-level evaluation on the same instances.
        evaluate(algorithm, instancesTest);

        // Write one annotated copy per test document.
        System.out.println("Annotate");
        for (File f : testData) {
            annotateFile(tsd, algorithm, f, out);
        }
    }

    /**
     * Creates windowed instances for the given documents and appends them to
     * {@code instancesTrain}.
     *
     * <p>
     * The current size of {@code instancesTrain} is handed to the factory
     * (presumably as the first id of the newly created instances - TODO
     * confirm), so repeated calls on the same list continue where the previous
     * call stopped.
     *
     * @param tsd
     *            type system used to load the documents
     * @param instancesTrain
     *            list that receives the created instances (despite its name it
     *            is used for training and test data alike)
     * @param trainData
     *            documents to read
     */
    private static void generateInstances(TypeSystemDescription tsd, List<Instance> instancesTrain,
            List<File> trainData) throws ResourceInitializationException, SAXException, IOException {
        // NOTE(review): 100000 and the two 3s are passed through unchanged; they
        // look like an instance limit and the left/right window sizes - verify
        // against InstanceCreationFactory.
        instancesTrain.addAll(InstanceCreationFactory.createWindowedInstancesFromUIMAFiles(trainData, 100000,
                instancesTrain.size(), 3, 3, TOKEN_TYPE_NAME, tsd,
                new BE_CONNL_NE_UIMA_FeatureGen("de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme",
                        "Entity", "morphTag"),
                new WordFeaturegenerator(), new CMU_PersonRegex(), new WordCategorizationNoNGram(),
                new CMU_TimeRegex(), new CMU_LineStartsWithToken(), new CMU_PrevTokenLineBreak(),
                new CMU_NextTokenLineBreak()));
    }

    /**
     * Applies the algorithm to every instance and prints the entity-level
     * evaluation of the predictions against the instances' own labels.
     *
     * @param algorithm
     *            trained algorithm
     * @param instancesTestA
     *            instances to evaluate on
     */
    private static void evaluate(MultiClassOneVsAllRepresentationRuleAlgorithm algorithm,
            List<Instance> instancesTestA) {
        ALabelling[] gold = goldLabels(instancesTestA).toArray(new ALabelling[0]);
        ALabelling[] system = predict(algorithm, instancesTestA).toArray(new ALabelling[0]);

        EntityAccuracyEvaluation entityAccuracyEvaluation = new EntityAccuracyEvaluation(
                EEntityEvaluationsScheme.BE, "-");
        System.out.println(entityAccuracyEvaluation.evaluateToString(gold, system));
    }

    /** Wraps the label stored in each instance into a labelling (second constructor argument fixed to 0). */
    private static List<ALabelling> goldLabels(List<Instance> instances) {
        List<ALabelling> labels = new ArrayList<>(instances.size());
        for (Instance instance : instances) {
            labels.add(new SimpleLabelling(instance.getLabel(), 0));
        }
        return labels;
    }

    /** Returns the algorithm's output for each instance, in input order. */
    private static List<ALabelling> predict(MultiClassOneVsAllRepresentationRuleAlgorithm algorithm,
            List<Instance> instances) {
        List<ALabelling> labels = new ArrayList<>(instances.size());
        for (Instance instance : instances) {
            labels.add(algorithm.apply(instance));
        }
        return labels;
    }

    /**
     * Labels one document with the trained algorithm and writes a copy that
     * contains the predicted entities to {@code out}, using the same file name.
     *
     * @param tsd
     *            type system used to load the document
     * @param algorithm
     *            trained algorithm
     * @param f
     *            XMI document to annotate
     * @param out
     *            existing folder that receives the annotated document
     * @throws Exception
     *             if instance creation, (de)serialization or the label
     *             conversion fails
     */
    private static void annotateFile(TypeSystemDescription tsd,
            MultiClassOneVsAllRepresentationRuleAlgorithm algorithm, File f, File out) throws Exception {
        List<Instance> instancesF = new ArrayList<>();
        generateInstances(tsd, instancesF, Arrays.asList(f));
        List<ALabelling> systemLabels = predict(algorithm, instancesF);

        EntityAccuracyEvaluation entityAccuracyEvaluation = new EntityAccuracyEvaluation(
                EEntityEvaluationsScheme.BE, "-");
        Map<Point, String> entitiesSystem = entityAccuracyEvaluation
                .convertLabellingToEntityLabelling(systemLabels.toArray(new ALabelling[0]));

        // The entities refer to tokens by index, so the tokens are needed as well.
        CAS cas = CasCreationUtils.createCas(tsd, null, null);
        Type tokType = cas.getTypeSystem().getType(TOKEN_TYPE_NAME);
        try (FileInputStream in = new FileInputStream(f)) {
            XmiCasDeserializer.deserialize(in, cas);
        }
        List<AnnotationFS> tokens = new ArrayList<>();
        for (AnnotationFS tok : cas.getAnnotationIndex(tokType)) {
            tokens.add(tok);
        }

        // One annotation per predicted entity: it spans from the begin of token
        // number x to the end of token number y and stores the label in the feature.
        Type typeSys = cas.getTypeSystem().getType(SYSTEM_TYPE_NAME);
        Feature featSys = typeSys.getFeatureByBaseName(SYSTEM_FEATURE_NAME);
        for (Map.Entry<Point, String> entity : entitiesSystem.entrySet()) {
            Point span = entity.getKey();
            AnnotationFS fs = cas.createAnnotation(typeSys, tokens.get(span.x).getBegin(),
                    tokens.get(span.y).getEnd());
            fs.setFeatureValueFromString(featSys, entity.getValue());
            cas.addFsToIndexes(fs);
        }

        try (FileOutputStream os = new FileOutputStream(new File(out, f.getName()))) {
            XmiCasSerializer.serialize(cas, os);
        }
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment