Commit b3de3348 authored by mak28ma's avatar mak28ma
Browse files

including results

parent e0a76167
......@@ -164,36 +164,38 @@ public class EntityAccuracyEvaluation implements IEvaluation {
beg = i;
}
// else if (split[0].startsWith("O")) {
// if (inEntity) {
// // close the previous entity
// entityMap.put(new Point(beg, i - 1), labels[i - 1].getStringLabel().split(typeSplitter)[1]);
// }
// inEntity = false;
//
// }
// else if (split[0].startsWith("O")) {
// if (inEntity) {
// // close the previous entity
// entityMap.put(new Point(beg, i - 1), labels[i -
// 1].getStringLabel().split(typeSplitter)[1]);
// }
// inEntity = false;
//
// }
else if (split[0].startsWith("E")) {
if (inEntity){
//build that entity
entityMap.put(new Point(beg, i), labels[i].getStringLabel().split(typeSplitter)[1]);
if (inEntity) {
// build that entity
entityMap.put(new Point(beg, i), openedS);
}
inEntity = false;
inEntity = false;
}
}
//possibly there is a last entity
// possibly there is a last entity
if (inEntity) {
// add it
entityMap.put(new Point(beg, labels.length - 1),openedS);
entityMap.put(new Point(beg, labels.length - 1), openedS);
}
}
private void createIOBEntities(ALabelling[] labels, Map<Point, String> entityMap) {
int beg = -1;
boolean inEntity = false;
String openedS = "";
for (int i = 0; i < labels.length; i++) {
ALabelling labelling = labels[i];
String stringLabel = labelling.getStringLabel();
......@@ -213,9 +215,11 @@ public class EntityAccuracyEvaluation implements IEvaluation {
}
inEntity = true;
beg = i;
openedS = split[1];
} else if (split[0].startsWith("I")) {
if (!inEntity) {
beg = i;
openedS = split[1];
}
// else {
// // check if we need to create an annotation because the
......@@ -226,7 +230,7 @@ public class EntityAccuracyEvaluation implements IEvaluation {
// String currentLabel = labels[i -
// 1].getStringLabel().split(typeSplitter)[1];
// if (previousLabel != currentLabel) {
// beg=i;
// beg = i;
// entityMap.put(new Point(beg, i - 1), labels[i -
// 1].getStringLabel().split(typeSplitter)[1]);
// }
......@@ -238,7 +242,7 @@ public class EntityAccuracyEvaluation implements IEvaluation {
if (inEntity) {
// close the previous entity
entityMap.put(new Point(beg, i - 1), labels[i - 1].getStringLabel().split(typeSplitter)[1]);
entityMap.put(new Point(beg, i - 1), openedS);
}
inEntity = false;
......
package de.uniwue.ls6.rulelearning.evaluation.fold;
import java.awt.Point;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
......@@ -41,4 +42,36 @@ public class FoldUtil {
return folds;
}
/**
 * Partitions the given files into {@code amountFolds} cross-validation folds.
 * <p>
 * Every fold uses one contiguous slice of {@code files} as its test set and
 * all remaining files as its training set. The slice boundaries are spread
 * evenly, so every file appears in exactly one test set and the test-set
 * sizes differ by at most one. When the number of files is a multiple of
 * {@code amountFolds} each slice has exactly {@code files.size() / amountFolds}
 * elements.
 *
 * @param files
 *            the files to distribute; shuffled <b>in place</b> when
 *            {@code random} is not {@code null}
 * @param random
 *            source of randomness for the shuffle, or {@code null} to keep
 *            the given order
 * @param amountFolds
 *            number of folds to create, must be at least 2
 * @param dafuq
 *            not read by this method; presumably only present to tell this
 *            overload apart from a sibling - TODO confirm
 * @return one {@link UnstructuredFileFold} per fold, in slice order
 * @throws IllegalArgumentException
 *             if {@code amountFolds} is smaller than 2
 */
public static List<UnstructuredFileFold> readInstancesToFold(List<File> files, Random random, int amountFolds, boolean dafuq) {
    if (amountFolds <= 1) {
        throw new IllegalArgumentException("Needs at least 2 folds!");
    }
    List<UnstructuredFileFold> folds = new ArrayList<>(amountFolds);
    if (random != null) {
        Collections.shuffle(files, random);
    }

    int total = files.size();
    for (int i = 0; i < amountFolds; i++) {
        // Balanced boundaries: a plain "total / amountFolds" slice width would
        // leave the last "total % amountFolds" files out of every test set.
        int testBegin = (int) ((long) i * total / amountFolds);
        int testEnd = (int) ((long) (i + 1) * total / amountFolds);

        List<File> testSet = new ArrayList<>(files.subList(testBegin, testEnd));
        List<File> trainingSet = new ArrayList<>(total - testSet.size());
        trainingSet.addAll(files.subList(0, testBegin));
        trainingSet.addAll(files.subList(testEnd, total));

        folds.add(new UnstructuredFileFold(trainingSet, testSet));
    }
    return folds;
}
}
package de.uniwue.ls6.rulelearning.evaluation.fold;
import java.io.File;
import java.util.List;
/**
 * Mutable holder for a single cross-validation fold, consisting of the files
 * used for training and the files held out for testing. The lists are stored
 * by reference; no copies are made.
 */
public class UnstructuredFileFold {

    /** Files that make up the training partition of this fold. */
    protected List<File> trainingset;

    /** Files that make up the test partition of this fold. */
    protected List<File> testSet;

    /**
     * Creates a fold from the two given partitions.
     *
     * @param trainingset
     *            files to train on
     * @param testSet
     *            files to evaluate on
     */
    public UnstructuredFileFold(List<File> trainingset, List<File> testSet) {
        this.testSet = testSet;
        this.trainingset = trainingset;
    }

    /** @return the training files of this fold */
    public List<File> getTrainingset() {
        return this.trainingset;
    }

    /** @return the test files of this fold */
    public List<File> getTestSet() {
        return this.testSet;
    }

    /**
     * Replaces the training partition.
     *
     * @param trainingset
     *            the new training files
     */
    public void setTrainingset(List<File> trainingset) {
        this.trainingset = trainingset;
    }

    /**
     * Replaces the test partition.
     *
     * @param testSet
     *            the new test files
     */
    public void setTestSet(List<File> testSet) {
        this.testSet = testSet;
    }
}
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
/**
 * Label generator that tags a token relative to the boundaries of annotations
 * of one configured type whose label feature equals one configured value.
 * <p>
 * Produced tags:
 * <ul>
 * <li>{@code BE-<label>} - an annotation covers exactly the token</li>
 * <li>{@code B-<label>} - an annotation begins at the token's begin</li>
 * <li>{@code E-<label>} - an annotation ends at the token's end</li>
 * <li>{@code O-} - none of the above (this includes tokens strictly inside
 * a longer annotation)</li>
 * </ul>
 */
public class BE_CONNL_NE_UIMA_FeatureGenLabel extends AFeatureGenerator {

    int prefixLen;
    String typeS;
    String featureId;
    private String label;

    /**
     * @param typeS
     *            name of the annotation type that is looked up in the type system
     * @param labelId
     *            identifier handed to the {@link AFeatureGenerator} constructor
     * @param featureId
     *            base name of the feature holding the annotation's label
     * @param label
     *            the label value an annotation must carry to be considered
     */
    public BE_CONNL_NE_UIMA_FeatureGenLabel(String typeS, String labelId, String featureId, String label) {
        super(labelId);
        this.typeS = typeS;
        this.featureId = featureId;
        this.label = label;
    }

    /**
     * Computes the boundary tag for the given token.
     *
     * @param token
     *            the token to label
     * @return a single-element array with the tag of the first matching
     *         annotation in index order, or {@code "O-"} if none matches
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        Type type = token.getCAS().getTypeSystem().getType(typeS);

        for (AnnotationFS anno : token.getCAS().getAnnotationIndex(type)) {
            String annoLabel = anno.getFeatureValueAsString(anno.getType().getFeatureByBaseName(featureId));
            // Compare from the configured side so an annotation whose label
            // feature is unset is skipped instead of raising an NPE.
            if (this.label == null || !this.label.equals(annoLabel)) {
                continue;
            }

            boolean sameBegin = anno.getBegin() == token.getBegin();
            boolean sameEnd = anno.getEnd() == token.getEnd();
            if (sameBegin && sameEnd) {
                return new String[] { "BE-" + annoLabel };
            }
            if (sameBegin) {
                return new String[] { "B-" + annoLabel };
            }
            if (sameEnd) {
                return new String[] { "E-" + annoLabel };
            }
        }
        return new String[] { "O-" };
    }
}
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationFS;
/**
 * Generates layout features for a token: properties of the text line that
 * contains it and the kind of whitespace that precedes it.
 * <p>
 * The generator is stateful: it remembers the token passed to the previous
 * call in order to inspect the text between the two tokens.
 */
public class CMU_LineFeats extends AFeatureGenerator {

    /** One line including its terminating {@code \n}; a final line without newline is not matched. */
    private static final Pattern LINE = Pattern.compile(".*?\\n");
    private static final Pattern CAPITALIZED = Pattern.compile("[A-Z][a-z]+");
    private static final Pattern ALL_UPPER = Pattern.compile("[A-Z]+");
    private static final Pattern DIGITS = Pattern.compile("[0-9]+");
    private static final Pattern ALL_LOWER = Pattern.compile("[a-z]+");

    /** Token seen by the previous call of {@link #generateFeatures(AnnotationFS)}. */
    private AnnotationFS prevToken = null;

    public CMU_LineFeats() {
        super("LineFeats");
    }

    /**
     * Builds the line and spacing features for the given token.
     *
     * @param token
     *            the token to describe
     * @return the line features (if the token lies in a newline-terminated
     *         line) followed by a {@code SPACE_BEFORE_TOKEN=} feature (if a
     *         previous token has been seen)
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        String documentText = token.getCAS().getDocumentText();
        List<String> features = new ArrayList<>();

        addLineFeatures(documentText, token, features);

        if (prevToken != null) {
            features.add(describeGap(documentText, prevToken, token));
        }
        prevToken = token;
        return features.toArray(new String[0]);
    }

    /** Finds the line containing the token and appends its features. */
    private static void addLineFeatures(String documentText, AnnotationFS token, List<String> features) {
        Matcher matcher = LINE.matcher(documentText);
        int lineIndex = 0;
        while (matcher.find()) {
            lineIndex++;
            // "<=" on the start: a token that begins exactly at the line start
            // belongs to that line too (a strict "<" skipped every first token).
            if (matcher.start() <= token.getBegin() && matcher.end() > token.getEnd()) {
                String[] parts = matcher.group().split(" ");
                features.add("AmountOfTokensInLine=" + parts.length);
                features.add("LineIndex=" + lineIndex);
                features.add("SimplifiedLineRow=" + simplify(parts));
                // matched lines never overlap, so no later line can contain the token
                break;
            }
        }
    }

    /** Maps each space-separated part of a line to one shape character. */
    private static String simplify(String[] parts) {
        StringBuilder shape = new StringBuilder();
        for (String part : parts) {
            if (CAPITALIZED.matcher(part).matches()) {
                shape.append('C');
            } else if (ALL_UPPER.matcher(part).matches()) {
                shape.append('U');
            } else if (DIGITS.matcher(part).matches()) {
                shape.append('D');
            } else if (ALL_LOWER.matcher(part).matches()) {
                shape.append('c');
            } else if (part.length() > 1) {
                // anything else is represented by its first character
                shape.append(part.charAt(0));
            }
        }
        return shape.toString();
    }

    /** Encodes the text between the previous and the current token (B=blank, N=newline, T=tab). */
    private static String describeGap(String documentText, AnnotationFS previous, AnnotationFS current) {
        int from = Math.min(previous.getEnd() + 1, current.getBegin());
        String gap = documentText.substring(from, current.getBegin());

        StringBuilder encoded = new StringBuilder("SPACE_BEFORE_TOKEN=");
        if (gap.isEmpty()) {
            encoded.append("NONE");
        } else {
            for (int i = 0; i < gap.length(); i++) {
                char c = gap.charAt(i);
                if (c == ' ') {
                    encoded.append('B');
                } else if (c == '\n') {
                    encoded.append('N');
                } else if (c == '\t') {
                    encoded.append('T');
                }
                // every other character is ignored
            }
        }
        return encoded.toString();
    }
}
......@@ -18,22 +18,28 @@ public class CMU_NextTokenLineBreak extends AFeatureGenerator {
FSIterator<AnnotationFS> iterator = token.getCAS().getAnnotationIndex(token.getType()).iterator();
iterator.moveTo(token);
iterator.moveToNext();
if (iterator.isValid()) {
AnnotationFS annotationFS = iterator.get();
String textBetween = token.getCAS().getDocumentText().substring(
Math.min(annotationFS.getEnd(), token.getBegin()),
Math.max(annotationFS.getEnd(), token.getBegin()));
if (textBetween.contains("\n") || textBetween.contains("\r")) {
return new String[] { super.featureIdentifier + "=" + "TRUE" };
} else {
return new String[] { super.featureIdentifier + "=" + "FALSE" };
}
String sub = documentText.substring(token.getEnd(),Math.min(documentText.length(), token.getEnd()+5));
if(sub.contains("\n") || sub.contains("\r")){
return new String[] { super.featureIdentifier + "=" + "TRUE" };
}
// if (iterator.isValid()) {
// AnnotationFS annotationFS = iterator.get();
//
// String textBetween = token.getCAS().getDocumentText().substring(
// Math.min(annotationFS.getEnd(), token.getBegin()),
// Math.max(annotationFS.getEnd(), token.getBegin()));
//
// if (textBetween.contains("\n") || textBetween.contains("\r")) {
// return new String[] { super.featureIdentifier + "=" + "TRUE" };
// } else {
// return new String[] { super.featureIdentifier + "=" + "FALSE" };
// }
//
// }
return new String[0];
}
......
......@@ -27,18 +27,6 @@ public class CMU_PersonRegex extends AFeatureGenerator {
//dot dot last name
personPatterns.add(Pattern.compile("(([A-Z]\\.)\\s*)+([A-Z][a-z]+)"));
// personPatterns.add(Pattern.compile("([A-Z][\\w-]*(\\.)?(\\s+[A-Z][\\w-]*)+) will present"));
// personPatterns.add(Pattern.compile("([A-Z][\\w-]*(\\.)?(\\s+[A-Z][\\w-]*)+)"));
// personPatterns.add(Pattern.compile("[\\n\\r]([A-Z][\\w-]*(\\.)?(\\s+[A-Z][\\w-]*)+)[\\n\\r]"));
// personPatterns.add(Pattern.compile("([A-Z][\\w-]*(\\.)?(\\s+[A-Z][\\w-]*)+) will speak"));
// personPatterns.add(Pattern.compile("([A-Z][\\w-]*(\\.)?(\\s+[A-Z][\\w-]*)+) will talk"));
// personPatterns.add(Pattern.compile("([A-Z][\\w-]*(\\.)?(\\s+[A-Z][\\w-]*)+) will be speaking"));
// personPatterns.add(Pattern.compile("([A-Z][\\w-]*(\\.)?(\\s+[A-Z][\\w-]*)+) will give a talk"));
// personPatterns.add(Pattern.compile("([A-Z][\\w-]*(\\.)?(\\s+[A-Z][\\w-]*)+).*discuss"));
// personPatterns.add(Pattern.compile("speaker is ([A-Z][\\w-]*(\\.)?(\\s+[A-Z][\\w-]*)+).*discuss"));
// personPatterns.add(Pattern.compile("speaker is ([A-Z][\\w-]*(\\.)?(\\s+[A-Z][\\w-]*)+).*talking"));
// personPatterns.add(Pattern.compile("([A-Z^\\n\\r]+[a-z^\\n\\r]*\\.?)(\\s+([A-Z^\\n\\r]+[a-z^\\n\\r]*\\.?))+"));
}
@Override
......
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.text.AnnotationFS;
/**
 * Generates surface-shape features of a token's covered text based on a
 * handful of digit/letter patterns. A token may receive several of the
 * features, or none.
 */
public class CMU_RoomFeats extends AFeatureGenerator {

    // Compiled once; the patterns are applied to every token.
    private static final Pattern FOUR_DIGITS = Pattern.compile("[0-9][0-9][0-9][0-9]");
    private static final Pattern THREE_DIGITS = Pattern.compile("[0-9][0-9][0-9]");
    private static final Pattern LETTERS_THEN_DIGITS = Pattern.compile("[A-Z]+([a-z])([A-Z])?[0-9]*");
    private static final Pattern DIGITS_THEN_LETTERS = Pattern.compile("[0-9]+[A-Z]*([a-z]*)([A-Z])?");

    public CMU_RoomFeats() {
        super("RoomFeats");
    }

    /**
     * Tests the token's covered text against each pattern.
     *
     * @param token
     *            the token to describe
     * @return one {@code TokFeat=...} entry per pattern that matches the whole
     *         covered text; empty if none matches
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        String text = token.getCoveredText();
        List<String> features = new ArrayList<>();

        if (FOUR_DIGITS.matcher(text).matches()) {
            features.add("TokFeat=Is4DigitNumber");
        }
        if (THREE_DIGITS.matcher(text).matches()) {
            features.add("TokFeat=Is3DigitNumber");
        }
        if (LETTERS_THEN_DIGITS.matcher(text).matches()) {
            features.add("TokFeat=MixNumberAndChars");
        }
        if (DIGITS_THEN_LETTERS.matcher(text).matches()) {
            features.add("TokFeat=MixCharsAndNumbers");
        }
        return features.toArray(new String[0]);
    }
}
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
/**
 * Label generator that emits {@code I-}/{@code B-}/{@code O-} tags for a token,
 * considering only annotations of one configured type whose label feature
 * equals the configured slot.
 * <p>
 * {@code B-<label>} is only produced when the annotation visited immediately
 * before the matching one ends exactly where the previous token ends and
 * carries the same label; every other covered token is tagged
 * {@code I-<label>}, and uncovered tokens are tagged {@code O-}.
 */
public class IOB_CONNL_NE_UIMA_FeatureGenLabel extends AFeatureGenerator {

    int prefixLen;
    String typeS;
    String featureId;
    String slot;

    /**
     * @param typeS
     *            name of the annotation type that is looked up in the type system
     * @param labelId
     *            identifier handed to the {@link AFeatureGenerator} constructor
     * @param featureId
     *            base name of the feature holding the annotation's label
     * @param slot
     *            the label value an annotation must carry to be considered
     */
    public IOB_CONNL_NE_UIMA_FeatureGenLabel(String typeS, String labelId, String featureId, String slot) {
        super(labelId);
        this.typeS = typeS;
        this.featureId = featureId;
        this.slot = slot;
    }

    /**
     * Computes the tag for the given token.
     *
     * @param token
     *            the token to label
     * @return a single-element array holding the tag
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        Type type = token.getCAS().getTypeSystem().getType(typeS);

        // Fetch the token preceding the given one; stays null if there is none.
        AnnotationFS lastTok = null;
        FSIterator<AnnotationFS> iterator = token.getCAS().getAnnotationIndex(token.getType()).iterator();
        iterator.moveTo(token);
        iterator.moveToPrevious();
        if (iterator.hasNext()) {
            lastTok = iterator.get();
        }

        // Annotation visited in the previous loop iteration, regardless of its label.
        AnnotationFS lastGoldAnno = null;
        for (AnnotationFS anno : token.getCAS().getAnnotationIndex(type)) {
            String label = anno.getFeatureValueAsString(anno.getType().getFeatureByBaseName(featureId));
            // Compare from the configured side: an unset label feature simply
            // does not match instead of raising an NPE.
            boolean isSlot = slot != null && slot.equals(label);

            if (isSlot && anno.getBegin() == token.getBegin() && anno.getEnd() >= token.getEnd()) {
                // Token opens the annotation: B only if it directly follows an
                // annotation of the same label, otherwise I.
                if (lastGoldAnno != null && lastTok != null && lastGoldAnno.getEnd() == lastTok.getEnd()) {
                    String prevLabel = lastGoldAnno
                            .getFeatureValueAsString(anno.getType().getFeatureByBaseName(featureId));
                    if (label.equals(prevLabel)) {
                        return new String[] { "B-" + label };
                    }
                }
                return new String[] { "I-" + label };
            }
            if (isSlot && token.getBegin() > anno.getBegin() && token.getEnd() <= anno.getEnd()) {
                // Token lies inside the annotation, after its first token.
                return new String[] { "I-" + label };
            }
            lastGoldAnno = anno;
        }
        return new String[] { "O-" };
    }
}
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationFS;
/**
 * Generates a feature stating whether the annotation that follows the given
 * token (in the index of the token's own type) starts with an uppercase
 * character.
 */
public class NextWordIsUppercaseFeatureGenerator2 extends AFeatureGenerator {

    public static final String LOWERCASE = "Lowercase";
    public static final String UPPERCASE = "Uppercase";

    public NextWordIsUppercaseFeatureGenerator2() {
        super("NextIsUC");
    }

    /**
     * Inspects the first character of the following annotation.
     *
     * @param token
     *            the token whose successor is examined
     * @return a single {@code <featureIdentifier>=Uppercase} or
     *         {@code <featureIdentifier>=Lowercase} entry, or an empty array
     *         if there is no successor or its covered text is empty
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        FSIterator<AnnotationFS> iterator = token.getCAS().getAnnotationIndex(token.getType()).iterator();
        iterator.moveTo(token);
        if (!iterator.hasNext()) {
            return new String[0];
        }
        iterator.moveToNext();
        if (!iterator.hasNext()) {
            return new String[0];
        }

        AnnotationFS next = iterator.get();
        if (next == null) {
            return new String[0];
        }
        String nextText = next.getCoveredText();
        // A zero-length annotation has no first character to inspect.
        if (nextText == null || nextText.isEmpty()) {
            return new String[0];
        }

        String casing = Character.isUpperCase(nextText.charAt(0)) ? UPPERCASE : LOWERCASE;
        return new String[] { super.featureIdentifier + "=" + casing };
    }
}
......@@ -160,9 +160,9 @@ public class InstanceCreationFactory {
}
public static List<Instance> createWindowedInstancesFromUIMAFiles(List<File> files, int amountInstance, int startIndex,
int leftWindowsize, int rightWindowSize, String tokentypeS, TypeSystemDescription typesystem,
AFeatureGenerator goldGenerator, AFeatureGenerator... generators)
public static List<Instance> createWindowedInstancesFromUIMAFiles(List<File> files, int amountInstance,
int startIndex, int leftWindowsize, int rightWindowSize, String tokentypeS,