Commit 9daabd58 authored by Markus Krug's avatar Markus Krug
Browse files

started to add javadoc, added logging

parent 756c4039
......@@ -5,21 +5,44 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import de.uniwue.ls6.datastructure.Instance;
import de.uniwue.ls6.datastructure.LabelAlphabet;
/**
*
* @author mkrug and dschloer
*
* This class serves the role of a condition-action rule, a condition is
* a feature subset, indicated as a List of sets and the action is a
* label. The rule itself saves a score to be able to predict
* probabilities
*
*/
public class RepresentationRule {
// condition
/**
* This List of set serves to save the feature subset in focus, whereby the
* set at index i shows the features that need to be present at position i
* for this rule to match
*/
private List<Set<Integer>> conditionSet;
// label of the rule
/**
* the action which this rule produces if it is applicable, the string
* analog is saved in the LabelAlphabet
*/
private Integer label;
private int maximumScore;
/**
*
* @param windowSize the amount of columns of features of an instance
* @param instanceArray the incoming features, stored in an array[][]
* @param label the action of this rule
* @param maximumScore
*/
public RepresentationRule(int windowSize, int[][] instanceArray, int label, int maximumScore) {
conditionSet = new ArrayList<Set<Integer>>(windowSize);
......@@ -37,7 +60,13 @@ public class RepresentationRule {
}
}
/**
*
* @param windowSize the amount of columns of features of an instance
* @param features the features stored as points, Point.x is the columns and Point.y the feature
* @param label the action of this rule
* @param maximumScore
*/
public RepresentationRule(int windowSize, List<Point> features, int label, int maximumScore) {
this.label = label;
conditionSet = new ArrayList<Set<Integer>>(windowSize);
......@@ -66,10 +95,17 @@ public class RepresentationRule {
}
}
/**
*
* @return a String that contains a pretty representation of this rule
*/
public String verbalizeRule() {
    // The verbalized form is exactly what the toString() override produces.
    final String verbalized = this.toString();
    return verbalized;
}
/**
* @return the String representation of a rule, rendered as Feature1 AND Feature2 => Label
*/
@Override
public String toString() {
......
......@@ -6,35 +6,75 @@ import java.util.List;
import de.uniwue.ls6.datastructure.Instance;
/**
*
* @author mkrug, dschloer
*
* A RulePass contains a set of rules that are applied in sequence to
* classify instances
*/
public class RulePass {
/**
* contains a set of rules that are applied within a pass over the instances
* the rulelist, order by the scores in which they are learned
*/
private List<RepresentationRule> ruleSet;
/**
* Default constructor, only creates an empty rule list
*/
public RulePass() {
    // Start with an empty, mutable rule list; the implicit super() call is sufficient.
    this.ruleSet = new ArrayList<RepresentationRule>();
}
/**
* Creates a Pass with a given List of rules
*
* @param ruleSet
* List of Rules, ordered by score
*/
public RulePass(List<RepresentationRule> ruleSet) {
    // The given list is stored as-is (no copy is made), so this pass and the
    // caller share the same list object.
    this.ruleSet = ruleSet;
}
/**
* Adds a rule to a given pass
*
* @param rule
* rule to be added to the pass
*/
public void addRule(RepresentationRule rule) {
    // List.add appends, so the new rule becomes the last rule of this pass.
    ruleSet.add(rule);
}
/**
* Removes a rule from a given pass
*
* @param rule
* rule to be removed from the rule pass
*/
public void removeRule(RepresentationRule rule) {
    // List.remove(Object) drops the first element equal to the given rule, if any.
    ruleSet.remove(rule);
}
/**
* returns a view of the rules of this rule pass
*
* @return An unmodifiable List of Rules of this Pass
*/
public List<RepresentationRule> getRuleset() {
    // Hand out a read-only view backed by the internal list, not a copy.
    final List<RepresentationRule> readOnlyView = Collections.unmodifiableList(this.ruleSet);
    return readOnlyView;
}
/**
*
* @param ins
* a learning instance
* @return whether this rulepass has at least one rule that can be applied
* to the Instance
*/
public boolean isApplicable(Instance ins) {
for (RepresentationRule rule : ruleSet) {
if (rule.isApplicable(ins))
......
package de.uniwue.ls6.datastructure;
/**
 * Abstract base class of a labelling. A labelling could be structured or just
 * a single label. The class currently declares no members.
 */
public abstract class ALabelling {
}
......@@ -4,28 +4,50 @@ import java.awt.Point;
import java.util.Arrays;
import java.util.List;
import de.uniwue.ls6.util.MatrixUtil;
import no.uib.cipr.matrix.Matrices;
import no.uib.cipr.matrix.MatrixEntry;
import no.uib.cipr.matrix.sparse.FlexCompColMatrix;
/**
* This class represents a single datapoint to be classified by the machine learning algorithm
* @author mkrug, dschloer
*
*/
public class Instance {
/*
* a datapoint to CLASSIFY
/**
* The array of features
*/
// featureset
private int[][] featureArray;
/**
* The goldlabel this instance carries
*/
private int label;
/**
* unique id for a given learning task
*/
private int id;
/**
* Creates an instance with no features
* @param nrCols the windowsize of the task
* @param nrRows usually the amount of features stored in this column
* @param id a unique id
*/
public Instance(int nrCols, int nrRows,int id) {
    this.id = id;
    // Java zero-initialises int arrays, so every slot of the nrCols x nrRows
    // feature array starts out as 0.
    this.featureArray = new int[nrCols][nrRows];
}
/**
* Creates an instance with no features
* @param nrCols the windowsize of the task
* @param nrRows usually the amount of features stored in this column
* @param id a unique id
* @param label the goldlabel of this instance
*/
public Instance(int nrCols, int nrRows, int label,int id) {
super();
featureArray = new int[nrCols][nrRows];
......@@ -33,10 +55,17 @@ public class Instance {
this.id = id;
}
public int getLabel() {
    // Plain accessor for the label field of this instance.
    return this.label;
}
/**
* sets the feature at position (col,row) to the given value
* @param col
* @param row
* @param value
*/
public void setFeatureAt(int col, int row, int value) {
    // Select the column first, then overwrite the slot at the given row.
    final int[] column = this.featureArray[col];
    column[row] = value;
}
......@@ -58,6 +87,11 @@ public class Instance {
// this could be made faster by either a change of representation
// (Map<int->Set or even by calculating the array position by a hash
// function)
/**
* Checks whether this instance has a set of features
* @param features the features that are checked if they are contained in this instance
* @return whether this instance contains all features of features
*/
public boolean containsFeature(List<Point> features) {
outer: for (Point p : features) {
......@@ -75,6 +109,11 @@ public class Instance {
return true;
}
/** Expands this instance clever for the next iteration
*
* @param mappings the Matrixmappings already included in the learning process
* @return the expanded instances in the size of the next kronecker expanded iteration
*/
// perform kronecker expansion should be executed in parallel for many instances
public FlexCompColMatrix expand(List<MatrixMapping> mappings) {
......@@ -117,7 +156,7 @@ public class Instance {
}
// assert that the entries in the expanded matrices are of size x(x-1)
assert (Matrices.cardinality(denseInstanceMatrix) * (Matrices.cardinality(denseInstanceMatrix) - 1) == Matrices
assert (Matrices.cardinality(denseInstanceMatrix) * (Matrices.cardinality(denseInstanceMatrix)) == Matrices
.cardinality(
expanedMatrix)) : "Amount of non null entries of kronecker matrix after instance expansion not correct";
......
......@@ -3,6 +3,14 @@ package de.uniwue.ls6.datastructure;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* LabelAlphabet, each learning task has its own alphabet. This is the mapping
* of String to ints to keep the internal data structures small and calculations
* efficient
*
* @author mkrug, dschloer
*
*/
public class LabelAlphabet {
/*
......@@ -55,12 +63,12 @@ public class LabelAlphabet {
public static String asString() {
StringBuilder sb = new StringBuilder();
sb.append("Alphabetsize: " + featureToIdMap.size()).append("\n");
for(String verboseFeature : featureToIdMap.keySet()){
sb.append(verboseFeature+ "\t" + featureToIdMap.get(verboseFeature) + "\n");
for (String verboseFeature : featureToIdMap.keySet()) {
sb.append(verboseFeature + "\t" + featureToIdMap.get(verboseFeature) + "\n");
}
return sb.toString();
}
......
......@@ -6,19 +6,15 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.uniwue.ls6.datastructure.Instance;
import de.uniwue.ls6.datastructure.LabelAlphabet;
......@@ -29,8 +25,9 @@ import no.uib.cipr.matrix.sparse.FlexCompColMatrix;
public class MatrixUtil {
public static MatrixMapping getMappingForMaximum(MatrixMcMatrixFace matrixface, int maximum,List<MatrixMapping> mappings) {
public static MatrixMapping getMappingForMaximum(MatrixMcMatrixFace matrixface, int maximum,
List<MatrixMapping> mappings, int beamSize) {
final Logger logger = LoggerFactory.getLogger(MatrixUtil.class);
// totally unugly code not even necessary which makes it worse
int formerMatrixSize = matrixface.getTpMatrix().numRows() == matrixface.getTpMatrix().numColumns()
? matrixface.getTpMatrix().numColumns() : -1;
......@@ -54,9 +51,9 @@ public class MatrixUtil {
}
}
System.out.println("Densedimension: "+ matrixMapping.getDenseMatrixDimension());
if (matrixMapping.getDenseMatrixDimension() > 20) {
System.out.println("Too many possible features! We restrict to the best 2500");
if (matrixMapping.getDenseMatrixDimension() > Math.sqrt(beamSize)) {
logger.warn("Too many possible features! We restrict to the best " + beamSize
+ " Amounf of potentially good features " + matrixMapping.getDenseMatrixDimension()*matrixMapping.getDenseMatrixDimension());
matrixMapping = new MatrixMapping(formerMatrixSize);
// sort
......@@ -69,7 +66,7 @@ public class MatrixUtil {
}
});
// add the top 2500 to the mapping
// add the top "beamsize" to the mapping
for (MatrixEntry entry : entries) {
matrixMapping.addEntry(new Point(entry.column(), entry.row()));
}
......@@ -89,7 +86,8 @@ public class MatrixUtil {
int dimension = lastMapping.getDenseMatrixDimension();
Supplier<MatrixMcMatrixFace> matrixConstructor = ()-> new MatrixMcMatrixFace(dimension * dimension, dimension * dimension, label);
Supplier<MatrixMcMatrixFace> matrixConstructor = () -> new MatrixMcMatrixFace(dimension * dimension,
dimension * dimension, label);
BiConsumer<MatrixMcMatrixFace, Instance> accumulator = (MatrixMcMatrixFace expandedMatrix, Instance inst) -> {
FlexCompColMatrix expandedInstance = inst.expand(mappings);
if (inst.getLabel() == label) {
......@@ -103,129 +101,9 @@ public class MatrixUtil {
a.getFpMatrix().add(b.getFpMatrix());
return a;
};
return instances.parallelStream().collect(Collector.of(
matrixConstructor,
accumulator,
join,Collector.Characteristics.UNORDERED
));
// // populate the matrix with the dataset this is expensive!
// List<FlexCompColMatrix> syncList = Collections.synchronizedList(new LinkedList<>());
// instances.parallelStream().forEach((Instance inst) -> {
// // expand
// FlexCompColMatrix expandedInstance = inst.expand(mappings);
//
// syncList.add(expandedInstance);
// // TODO can we use something faster here... currently this results
// // in a speedup of 3...
// synchronized (expandedMatrixFace) {
// // add to kronecker
// if (inst.getLabel() == label) {
// expandedMatrixFace.addToMatrix(expandedMatrixFace.getTpMatrix(), expandedInstance);
// } else {
// expandedMatrixFace.addToMatrix(expandedMatrixFace.getFpMatrix(), expandedInstance);
// }
// }
//
// });
// return expandedMatrixFace;
}
public static MatrixMcMatrixFace performKroneckerExpansionWithIndex(List<MatrixMapping> mappings,
Collection<Instance> instances, int label, Map<Point, Set<Instance>> index) {
MatrixMapping lastMapping = mappings.get(mappings.size() - 1);
int dimension = lastMapping.getDenseMatrixDimension();
MatrixMcMatrixFace expandedMatrixFace = new MatrixMcMatrixFace(dimension * dimension, dimension * dimension,
label);
ExecutorService executor = Executors.newFixedThreadPool(8);
// for each entry of the new matrix do...
for (int col = 0; col < dimension*dimension; col++) {
for (int row = 0; row < dimension*dimension; row++) {
final int column = col;
final int rowFin = row;
Runnable runnable = new Runnable() {
@Override
public void run() {
// recalculate this entry to feature points
// first is to revert the kronecker
List<Point> reversedfeatures = new ArrayList<>();
reversedfeatures.add(new Point(column, rowFin));
revertKroneckerExpansion(lastMapping, reversedfeatures);
// if the mapping does not contain one of those points
// we can
// not assume to find anything
for (Point p : reversedfeatures) {
if (!lastMapping.getInverseMappingMap().containsKey(p))
return;
}
// now determine the features
Set<Point> features = new HashSet<>();
features.addAll(determineFeaturesForIndex(reversedfeatures.get(0), mappings));
features.addAll(determineFeaturesForIndex(reversedfeatures.get(1), mappings));
// now we can access our index and get all instances
// that have
// exactly this feature
Set<Instance> goodSet = new HashSet<>();
for (Point feature : features) {
Set<Instance> set = index.get(feature);
// theres no need for instances to even exist
if (set == null) {
goodSet.clear();
break;
}
if (!goodSet.isEmpty()) {
goodSet.retainAll(set);
} else {
// keep only those that are already contained in
// goodset
for (Instance i : set) {
if (instances.contains(i)) {
goodSet.add(i);
}
}
}
}
// increase the matrix accordingly
// add to kronecker
int tps = 0;
int fps = 0;
for (Instance i : goodSet) {
if (i.getLabel() == label)
tps++;
else {
fps++;
}
}
if (tps == 0 && fps == 0)
return;
synchronized (expandedMatrixFace) {
expandedMatrixFace.getTpMatrix().set(rowFin, column, tps);
expandedMatrixFace.getFpMatrix().set(rowFin, column, fps);
}
}
};
executor.execute(runnable);
}
}
executor.shutdown();
while (!executor.isTerminated()) {
}
return expandedMatrixFace;
// this is done because the matrix is not thread safe!!
return instances.parallelStream()
.collect(Collector.of(matrixConstructor, accumulator, join, Collector.Characteristics.UNORDERED));
}
public static List<Point> determineFeaturesForIndex(Point index, List<MatrixMapping> mappings) {
......@@ -248,35 +126,34 @@ public class MatrixUtil {
public static void revertKroneckerExpansion(MatrixMapping previousMapping, List<Point> reversedfeatures) {
List<Point> toAdd = new ArrayList<Point>();
Iterator<Point> iterator = reversedfeatures.iterator();
while(iterator.hasNext()){
while (iterator.hasNext()) {
Point p = iterator.next();
iterator.remove();
// revert the kronecker expansion step this generates 2 points
int sizeBeforeExpansion = (int) previousMapping.getDenseMatrixDimension();
int xBefore1 = (int) Math.floor(p.x / sizeBeforeExpansion);
int xBefore2 = p.x % sizeBeforeExpansion;
int yBefore1 = (int) Math.floor(p.y / sizeBeforeExpansion);
int yBefore2 = p.y % sizeBeforeExpansion;
Point firstBack = new Point(xBefore1, yBefore1);
Point secondBack = new Point(xBefore2, yBefore2);
toAdd.add(secondBack);
toAdd.add(firstBack);
// assert that the recalculated points are contained in the mapping,
// this also guarantes the dimension is respected
assert (previousMapping.getInverseMappingMap().containsKey(firstBack) && previousMapping
.getInverseMappingMap().containsKey(secondBack)) : "Reverted Points not part of the mapping";
// assure no point is null
assert (secondBack != null && firstBack != null) : "One of the reverted points is null";
}
reversedfeatures.addAll(toAdd);
}
private static void revertMapping(MatrixMapping currentMapping, List<Point> reversedfeatures) {
......
......@@ -11,6 +11,9 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import de.uniwue.ls6.algorithm.datastructure.RepresentationRule;
import de.uniwue.ls6.algorithm.datastructure.RulePass;
import de.uniwue.ls6.datastructure.ALabelling;
......@@ -26,12 +29,16 @@ public class BinaryRepresentationRuleLearningAlgorithm implements IRepresentatio
List<RulePass> passes;
private int goldLabel;
private int otherLabel;
private Map<Point, Set<Instance>> index;
private int beamSize;
final Logger logger = LoggerFactory.getLogger(BinaryRepresentationRuleLearningAlgorithm.class);
public BinaryRepresentationRuleLearningAlgorithm(int goldLabel, int otherLabel) {
public BinaryRepresentationRuleLearningAlgorithm(int goldLabel, int otherLabel,int beamSize) {
passes = new LinkedList<RulePass>();
this.goldLabel = goldLabel;
this.otherLabel = otherLabel;
this.beamSize = beamSize;
}
public void learn(Instance... instances) {
......@@ -43,13 +50,10 @@ public class BinaryRepresentationRuleLearningAlgorithm implements IRepresentatio
int passIndex = 0;
int currentGoldIndex = goldLabel;