Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve code quality of token sequence normalization #1872

Merged
merged 8 commits into from
Jul 31, 2024
4 changes: 2 additions & 2 deletions core/src/main/java/de/jplag/Submission.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import org.slf4j.LoggerFactory;

import de.jplag.exceptions.LanguageException;
import de.jplag.normalization.TokenStringNormalizer;
import de.jplag.normalization.TokenSequenceNormalizer;
import de.jplag.options.JPlagOptions;

/**
Expand Down Expand Up @@ -259,7 +259,7 @@ private static File createErrorDirectory(String... subdirectoryNames) {
*/
void normalize() {
List<Integer> originalOrder = getOrder(tokenList);
tokenList = TokenStringNormalizer.normalize(tokenList);
tokenList = TokenSequenceNormalizer.normalize(tokenList);
List<Integer> normalizedOrder = getOrder(tokenList);

logger.debug("original line order: {}", originalOrder);
Expand Down
5 changes: 5 additions & 0 deletions core/src/main/java/de/jplag/SubmissionSet.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ public List<Submission> getInvalidSubmissions() {
return invalidSubmissions;
}

/**
* Normalizes the token sequences of all submissions (including basecode). This makes the token sequence invariant to
* dead code insertion and independent statement reordering by removing dead tokens and optionally reordering tokens to
* a deterministic order.
*/
public void normalizeSubmissions() {
if (baseCodeSubmission != null) {
baseCodeSubmission.normalize();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import de.jplag.semantics.Variable;

/**
* Models a multiple edge in the normalization graph. Contains multiple edges.
* Models multiple edges between two nodes in the normalization graph.
*/
class MultipleEdge {
private final Set<Edge> edges;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,28 @@
import de.jplag.semantics.Variable;

/**
* Constructs the normalization graph.
* Token normalization graph, which is a directed graph based on nodes of type {@link Statement} and edges of type
* {@link MultipleEdge}. This class inherits from {@link SimpleDirectedGraph} to provide a data structure for the
* token sequence normalization.
*/
class NormalizationGraphConstructor {
private final SimpleDirectedGraph<Statement, MultipleEdge> graph;
public class NormalizationGraph extends SimpleDirectedGraph<Statement, MultipleEdge> {

private static final long serialVersionUID = -8407465274643809647L; // generated

private int bidirectionalBlockDepth;
private final Collection<Statement> fullPositionSignificanceIncoming;
private Statement lastFullPositionSignificance;
private Statement lastPartialPositionSignificance;
private final Map<Variable, Collection<Statement>> variableReads;
private final Map<Variable, Collection<Statement>> variableWrites;
private final Set<Statement> inCurrentBidirectionalBlock;
private Statement current;

NormalizationGraphConstructor(List<Token> tokens) {
graph = new SimpleDirectedGraph<>(MultipleEdge.class);
private final transient Collection<Statement> fullPositionSignificanceIncoming;
private transient Statement lastFullPositionSignificance;
private transient Statement lastPartialPositionSignificance;
private final transient Map<Variable, Collection<Statement>> variableReads;
private final transient Map<Variable, Collection<Statement>> variableWrites;
private final transient Set<Statement> inCurrentBidirectionalBlock;
private transient Statement current;

/**
* Creates a new normalization graph.
*/
public NormalizationGraph(List<Token> tokens) {
super(MultipleEdge.class);
bidirectionalBlockDepth = 0;
fullPositionSignificanceIncoming = new ArrayList<>();
variableReads = new HashMap<>();
Expand All @@ -45,12 +52,8 @@ class NormalizationGraphConstructor {
addStatement(builderForCurrent.build());
}

SimpleDirectedGraph<Statement, MultipleEdge> get() {
return graph;
}

private void addStatement(Statement statement) {
graph.addVertex(statement);
addVertex(statement);
this.current = statement;
processBidirectionalBlock();
processFullPositionSignificance();
Expand Down Expand Up @@ -123,10 +126,10 @@ private void processWrites() {
* @param cause the variable that caused the edge, may be null
*/
private void addIncomingEdgeToCurrent(Statement start, EdgeType type, Variable cause) {
MultipleEdge multipleEdge = graph.getEdge(start, current);
MultipleEdge multipleEdge = getEdge(start, current);
if (multipleEdge == null) {
multipleEdge = new MultipleEdge();
graph.addEdge(start, current, multipleEdge);
addEdge(start, current, multipleEdge);
}
multipleEdge.addEdge(type, cause);
}
Expand All @@ -135,4 +138,5 @@ private void addVariableToMap(Map<Variable, Collection<Statement>> variableMap,
variableMap.putIfAbsent(variable, new ArrayList<>());
variableMap.get(variable).add(current);
}

}
11 changes: 8 additions & 3 deletions core/src/main/java/de/jplag/normalization/Statement.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,19 @@
import de.jplag.semantics.CodeSemantics;

/**
* Models statements, which are the nodes of the normalization graph.
* Models statements, which are the nodes of the normalization graph. A statement refers to one or more tokens.
*/
class Statement implements Comparable<Statement> {

private final List<Token> tokens;
private final int lineNumber;
private final CodeSemantics semantics;

/**
* Constructs a new Statement.
* @param tokens the list of tokens that represent this statement.
* @param lineNumber the line number where this statement occurs in the source code.
*/
Statement(List<Token> tokens, int lineNumber) {
this.tokens = Collections.unmodifiableList(tokens);
this.lineNumber = lineNumber;
Expand All @@ -30,8 +35,8 @@ CodeSemantics semantics() {
return semantics;
}

void markKeep() {
semantics.markKeep();
void markAsCritical() {
semantics.markAsCritical();
}

private int tokenOrdinal(Token token) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ class StatementBuilder {
private final List<Token> tokens;
private final int lineNumber;

/**
* Constructs a new StatementBuilder.
* @param lineNumber the line number where the statement starts in the source code.
*/
StatementBuilder(int lineNumber) {
this.lineNumber = lineNumber;
this.tokens = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,45 +1,50 @@
package de.jplag.normalization;

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.stream.Collectors;

import org.jgrapht.Graphs;
import org.jgrapht.graph.SimpleDirectedGraph;

import de.jplag.Token;

/**
* Performs token sequence normalization.
*/
public class TokenStringNormalizer {
public final class TokenSequenceNormalizer {

private TokenStringNormalizer() {
private TokenSequenceNormalizer() {
// private constructor for non-instantiability.
}

/**
* Performs token sequence normalization. Tokens representing dead code have been eliminated and tokens representing
* subsequent independent statements have been put in a fixed order. Works by first constructing a Normalization Graph
* and then turning it back into a token sequence.
* subsequent independent statements have been put in a fixed order if sorting is true. Works by first constructing a
* Normalization Graph and then turning it back into a token sequence. For more information refer to the
* <a href="https://doi.org/10.1145/3639478.3643074">corresponding paper</a>.
* @param tokens The original token sequence, remains unaltered.
* @return The normalized token sequence as unmodifiable list.
* @return The normalized token sequence.
*/
public static List<Token> normalize(List<Token> tokens) {
SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph = new NormalizationGraphConstructor(tokens).get();
NormalizationGraph graph = new NormalizationGraph(tokens);
propagateCriticalityStatus(graph);
return normalizeWithSorting(tokens, graph);
}

// Add tokens in normalized original order, removing dead tokens
private static List<Token> normalizeWithSorting(List<Token> tokens, NormalizationGraph normalizationGraph) {
List<Token> normalizedTokens = new ArrayList<>(tokens.size());
spreadKeep(normalizationGraph);
PriorityQueue<Statement> roots = normalizationGraph.vertexSet().stream() //
.filter(v -> !Graphs.vertexHasPredecessors(normalizationGraph, v)) //
.collect(Collectors.toCollection(PriorityQueue::new));
while (!roots.isEmpty()) {
PriorityQueue<Statement> newRoots = new PriorityQueue<>();
do {
Statement statement = roots.poll();
if (statement.semantics().keep()) {
if (statement.semantics().isCritical()) {
normalizedTokens.addAll(statement.tokens());
}
for (Statement successor : Graphs.successorListOf(normalizationGraph, statement)) {
Expand All @@ -51,26 +56,29 @@ public static List<Token> normalize(List<Token> tokens) {
} while (!roots.isEmpty());
roots = newRoots;
}
return Collections.unmodifiableList(normalizedTokens);
return normalizedTokens;
}

/**
* Spread keep status to every node that does not represent dead code. Nodes without keep status are later eliminated.
* Spread criticality status to every node that does not represent dead code. Nodes without criticality status are later
* eliminated (dead nodes). Before calling this method, only the statements that directly affect the behavior are marked
* as critical. After calling this method, this also holds true for statements that (transitively) depend (read/write) on
* the critical ones.
*/
private static void spreadKeep(SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph) {
private static void propagateCriticalityStatus(NormalizationGraph normalizationGraph) {
Queue<Statement> visit = new LinkedList<>(normalizationGraph.vertexSet().stream() //
.filter(tl -> tl.semantics().keep()).toList());
.filter(tl -> tl.semantics().isCritical()).toList());
while (!visit.isEmpty()) {
Statement current = visit.remove();
for (Statement predecessor : Graphs.predecessorListOf(normalizationGraph, current)) { // performance of iteration?
if (!predecessor.semantics().keep() && normalizationGraph.getEdge(predecessor, current).isVariableFlow()) {
predecessor.markKeep();
if (!predecessor.semantics().isCritical() && normalizationGraph.getEdge(predecessor, current).isVariableFlow()) {
predecessor.markAsCritical();
visit.add(predecessor);
}
}
for (Statement successor : Graphs.successorListOf(normalizationGraph, current)) {
if (!successor.semantics().keep() && normalizationGraph.getEdge(current, successor).isVariableReverseFlow()) {
successor.markKeep();
if (!successor.semantics().isCritical() && normalizationGraph.getEdge(current, successor).isVariableReverseFlow()) {
successor.markAsCritical();
visit.add(successor);
}
}
Expand Down
2 changes: 1 addition & 1 deletion core/src/test/java/de/jplag/NormalizationTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,4 @@ void testReorderingNormalization() {
void testInsertionReorderingNormalization() {
Assertions.assertIterableEquals(originalTokenString, tokenStringMap.get("SquaresInsertedReordered.java"));
}
}
}
Loading