Commit bdcbf4f8 authored by Markus Krug's avatar Markus Krug
Browse files

update scores

parent bb0ac6f4
......@@ -23,9 +23,13 @@ public class CMU_LineStartsWithToken extends AFeatureGenerator{
String line = matcher.group();
if(matcher.start() <token.getBegin() && matcher.end()>token.getEnd()){
String firstTok = line.split(" ")[0];
return new String[]{super.featureIdentifier+"="+firstTok};
}
}
return new String[0];
}
......
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.text.AnnotationFS;
/**
 * Feature generator that marks annotations whose end offset coincides with the
 * end of a location keyword match (e.g. "Hall", "Room", "Building") in the
 * document text.
 */
public class CMU_LocationRegex extends AFeatureGenerator {

    /** Patterns that are searched in the complete document text. */
    List<Pattern> locationPatterns;

    /**
     * Registers the generator under the identifier "LocationRegex" and sets up
     * the single keyword pattern.
     */
    public CMU_LocationRegex() {
        super("LocationRegex");
        // location keywords
        Pattern keywords = Pattern
                .compile("(Hall|room|Room|Building|Auditorium|Library|building|Avenue|avenue|Institute|Wing)");
        locationPatterns = new ArrayList<>();
        locationPatterns.add(keywords);
    }

    /**
     * Scans the whole document text with every pattern and emits one feature for
     * each match that ends exactly where the given annotation ends.
     *
     * @param token the annotation to generate features for
     * @return features of the form {@code <identifier><n>=E-LocationRegex}, where
     *         {@code n} is the 1-based running number of the match in the
     *         document; empty if no match ends at the annotation's end
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        final String text = token.getCAS().getDocumentText();
        final List<String> result = new ArrayList<>();
        // running number of matches seen so far, counted across all patterns
        int matchCounter = 0;
        for (int i = 0; i < locationPatterns.size(); i++) {
            for (Matcher m = locationPatterns.get(i).matcher(text); m.find();) {
                matchCounter++;
                if (m.end() == token.getEnd()) {
                    result.add(super.featureIdentifier + matchCounter + "=" + "E-LocationRegex");
                }
            }
        }
        return result.toArray(new String[result.size()]);
    }
}
......@@ -2,9 +2,9 @@ package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator;
import org.apache.uima.cas.text.AnnotationFS;
public class WordFeaturegenerator extends AFeatureGenerator{
public class TextFeaturegenerator extends AFeatureGenerator{
public WordFeaturegenerator() {
public TextFeaturegenerator() {
super("Text");
}
......
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.chunk;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.AFeatureGenerator;
/**
 * Feature generator for chunk annotations. For a given annotation it emits:
 * <ul>
 * <li>text and POS value of every POS annotation between the chunk's begin and end,</li>
 * <li>text and POS value of up to four POS annotations before and after the chunk,</li>
 * <li>a list-match tag (first name, last name, title) for every space-separated
 * word of the chunk,</li>
 * <li>the initial characters of those words, raw and in a unified form.</li>
 * </ul>
 */
public class CMU_ChunkWordFeats extends AFeatureGenerator {

    /** Number of POS annotations considered before and after the chunk. */
    private static final int CONTEXT_WINDOW = 4;

    private final Set<String> firstnames = new HashSet<>();
    private final Set<String> lastnames = new HashSet<>();
    private final Set<String> titles = new HashSet<>();

    /**
     * Loads the name and title lists from the {@code resources} directory.
     *
     * @throws FileNotFoundException if one of the list files cannot be opened
     */
    public CMU_ChunkWordFeats() throws FileNotFoundException {
        super("ChunkLineFeats");
        // Each file is opened, read and closed before the next one is touched, so
        // a missing file can no longer leave previously opened streams dangling.
        read("resources/female_names.txt", firstnames);
        read("resources/male_names.txt", firstnames);
        read("resources/ExtraNachnamen.txt", lastnames);
        read("resources/academicTitles.txt", titles);
    }

    /**
     * Adds every trimmed line of the given file to the set. The file is decoded
     * with the platform default charset.
     *
     * @param path file to read
     * @param set  target collection for the trimmed lines
     * @throws FileNotFoundException if the file cannot be opened
     */
    private void read(String path, Set<String> set) throws FileNotFoundException {
        // opened outside the try block so that a missing file is reported to the caller
        FileInputStream stream = new FileInputStream(new File(path));
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
            String line;
            while ((line = reader.readLine()) != null) {
                set.add(line.trim());
            }
        } catch (IOException e) {
            // best effort: keep whatever was read so far
            e.printStackTrace();
        }
    }

    /**
     * Generates the chunk features described in the class comment.
     *
     * @param token the chunk annotation
     * @return the generated features, each of the form {@code name=value}
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        List<String> features = new ArrayList<>();
        Type posType = token.getCAS().getTypeSystem()
                .getType("de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS");
        Feature posTag = posType.getFeatureByBaseName("PosValue");

        // Collect all POS annotations and remember the positions of those that
        // share the chunk's begin resp. end offset (the last one wins; 0 if none).
        List<AnnotationFS> tokens = new ArrayList<>();
        int index = 0;
        int indexEnd = 0;
        for (AnnotationFS tok : token.getCAS().getAnnotationIndex(posType)) {
            if (tok.getBegin() == token.getBegin()) {
                index = tokens.size();
            }
            if (tok.getEnd() == token.getEnd()) {
                indexEnd = tokens.size();
            }
            tokens.add(tok);
        }
        int lastIndex = tokens.size() - 1;

        // tokens inside the chunk; the upper bound guards against an empty POS index
        int in = 1;
        for (int tok = index; tok <= Math.min(indexEnd, lastIndex); tok++) {
            addTokenFeatures(features, tokens.get(tok), posTag, "In", in++);
        }

        // tokens before the chunk, nearest first
        int before = 1;
        for (int tok = index - 1; tok >= Math.max(index - CONTEXT_WINDOW, 0); tok--) {
            addTokenFeatures(features, tokens.get(tok), posTag, "Before", before++);
        }

        // tokens after the chunk's first token, nearest first
        int after = 1;
        for (int tok = index + 1; tok <= Math.min(index + CONTEXT_WINDOW, lastIndex); tok++) {
            addTokenFeatures(features, tokens.get(tok), posTag, "After", after++);
        }

        // list matches and initial characters of the space-separated words
        StringBuilder listEntries = new StringBuilder();
        StringBuilder wordBegs = new StringBuilder();
        for (String s : token.getCoveredText().split(" ")) {
            listEntries.append(determineListMatch(s));
            // empty parts (e.g. from consecutive blanks) are represented by '#'
            wordBegs.append(s.isEmpty() ? '#' : s.charAt(0));
        }
        features.add("ListMatch" + "=" + listEntries);
        features.add("Wordbegins" + "=" + wordBegs);
        features.add("WordbeginsUnified" + "=" + unifyString(wordBegs.toString()));

        return features.toArray(new String[0]);
    }

    /**
     * Adds the text feature {@code Token<label><position>} and the POS feature
     * {@code Token<label>POS<position>} for one POS annotation.
     */
    private static void addTokenFeatures(List<String> features, AnnotationFS tok, Feature posTag, String label,
            int position) {
        String pos = tok.getFeatureValueAsString(posTag);
        features.add("Token" + label + position + "=" + tok.getCoveredText());
        features.add("Token" + label + "POS" + position + "=" + pos);
    }

    /**
     * Maps ASCII lower-case letters to 'c', ASCII upper-case letters to 'C' and
     * ASCII digits to 'D'; every other character is kept as it is.
     *
     * @param coveredText the text to unify
     * @return the unified text, same length as the input
     */
    private String unifyString(String coveredText) {
        StringBuilder refined = new StringBuilder(coveredText.length());
        for (int i = 0; i < coveredText.length(); i++) {
            char c = coveredText.charAt(i);
            if (c >= 'a' && c <= 'z') {
                refined.append('c');
            } else if (c >= 'A' && c <= 'Z') {
                refined.append('C');
            } else if (c >= '0' && c <= '9') {
                refined.append('D');
            } else {
                refined.append(c);
            }
        }
        return refined.toString();
    }

    /**
     * Looks the word up in the loaded lists; first names take precedence over
     * last names, which take precedence over titles.
     *
     * @param s the word to look up
     * @return the tag of the first list containing the word, or
     *         {@code <Not_In_List>} if no list contains it
     */
    private String determineListMatch(String s) {
        if (firstnames.contains(s)) {
            return "<FIRST_NAME>";
        }
        if (lastnames.contains(s)) {
            return "<LAST_NAME>";
        }
        if (titles.contains(s)) {
            return "<TITLE>";
        }
        return "<Not_In_List>";
    }
}
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.chunk;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.uima.cas.text.AnnotationFS;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.AFeatureGenerator;
/**
 * Feature generator that checks every line of an annotation's covered text
 * against a list of entries. A line only counts as a hit if it is contained in
 * the list exactly.
 */
public class PartInListFeatureGenerator extends AFeatureGenerator {

    /** All trimmed lines of the list. */
    Set<String> listEntries;

    /** The subset of list entries that consist of more than one space-separated word. */
    Set<String> longEntries;

    /**
     * Reads the list from the given stream, one entry per line, and closes the
     * stream afterwards. The stream is decoded with the platform default charset.
     *
     * @param featureIdentidier identifier used as prefix of the generated features
     * @param inStream          stream providing the list entries
     */
    public PartInListFeatureGenerator(String featureIdentidier, InputStream inStream) {
        super(featureIdentidier);
        listEntries = new HashSet<String>();
        longEntries = new HashSet<>();
        // try-with-resources closes the reader even if reading fails midway
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(inStream))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String entry = line.trim();
                listEntries.add(entry);
                if (entry.split(" ").length > 1) {
                    longEntries.add(entry);
                }
            }
        } catch (IOException e) {
            // best effort: keep whatever was read so far
            e.printStackTrace();
        }
    }

    /**
     * Emits {@code <identifier><i>=IN_LIST} for every line {@code i} (0-based) of
     * the covered text that is an entry of the list. The line position is part of
     * the feature name, so hits on different lines yield distinct features.
     *
     * @param token the annotation to generate features for
     * @return one feature per matching line; empty if no line is in the list
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        List<String> features = new ArrayList<>();
        String[] split = token.getCoveredText().split("\n");
        for (int index = 0; index < split.length; index++) {
            if (listEntries.contains(split[index])) {
                features.add(super.featureIdentifier + index + "=" + "IN_LIST");
            }
        }
        return features.toArray(new String[0]);
    }
}
package de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.chunk;
import java.util.ArrayList;
import java.util.List;
import org.apache.uima.cas.text.AnnotationFS;
import de.uniwue.ls6.rulelearning.instanceloading.featuregenerator.AFeatureGenerator;
/**
 * Feature generator that turns every line of an annotation's covered text into
 * its own feature.
 */
public class PartOfFeaturegenerator extends AFeatureGenerator {

    /** Registers the generator under the identifier "PartOf". */
    public PartOfFeaturegenerator() {
        super("PartOf");
    }

    /**
     * Splits the covered text at line breaks and emits
     * {@code <identifier><i>=<line>} for the line at 0-based position {@code i}.
     *
     * @param token the annotation to generate features for
     * @return one feature per line of the covered text, in document order
     */
    @Override
    public String[] generateFeatures(AnnotationFS token) {
        final String[] lines = token.getCoveredText().split("\n");
        final String[] result = new String[lines.length];
        for (int i = 0; i < lines.length; i++) {
            result[i] = super.featureIdentifier + i + "=" + lines[i];
        }
        return result;
    }
}
......@@ -191,7 +191,6 @@ public class InstanceCreationFactory {
Queue<List<String>> instQueue = new LinkedList<>();
AnnotationFS token = featList.get(i).getToken();
String goldFeat = goldGenerator.generateFeatures(token)[0];
// System.out.println(token.getCoveredText() + "\t" + goldFeat);
for (int k = i - leftWindowsize; k <= i + rightWindowSize; k++) {
......
Dr.
Jeffrey
D.
Hermes
Professor
John
Skvoretz
Andrew
Gault
Mary
Francis
McLaughlin
Jessie
Ramey
Judi
Mancuso
Professors
Rob
A.
Rutenbar
Wojciech
Maly
Joe
Mertz
Professor
Warren
Baker
Professor
Steven
Granick
Steve
Granick
Richard
Maddox
FARRO
F.
RADJY,
PH.D.
Dr.
Franklyn
G.
Prendergast
Professor
Sara
Kiesler
Gary
K.
Fedder
Dr.
R.
J.
(Bob)
Pangborn
Laura
Petitto
Patricia
Brooks
Stephanie
Shaw
Professor
Shaw
Professor
Shaw
Professor
Shaw
Woody
Vasulka
Vasulka
Vasulka
MERCE
CUNNINGHAM
Joyce
Scott
Joyce
Scott
Ms.
Scott
Steven
Kurtz
Mr.
Kurtz
R
A
V
I
K
I
R
A
N
James
Demmel
George
W.
Cobb
George
W.
Cobb
Bruce
Sherwood
David
Banks
David
Banks
Larisa
Naples
Karen
Schriver
Professor
Manfred
Paul
Professor
Manfred
Paul
Joy
Mountford
Joy
Mountford
Maxion
Alex
Rudnicky
Alexander
I.
Rudnicky
Emilie
Roth
Emily
Roth
Emilie
Roth
Richard
M
Young
John
Gould
JOHN
GOULD
John
Gould
JOHN
D.
GOULD
Alex
Waibel
Jolene
Galegher
Robert
Stevens
Robert
Stevens
Robert
Stevens
Tom
Mitchell
Tom
M.
Mitchell
Randy
Pausch
Randy
Pausch
Randy
Pausch
Randy
Pausch
Ralph
D.
Hill
Ralph
Hill
Tom
Neuendorffer
Michael