Compute Source Code Similarity
I have tried to analyze source code by various methods recently.
In this post, I will show you my artifact - Compute source code similarity based on Jaccard Similarity Coefficient.
Jaccard Similarity Coefficient is an common and quick techniqueto compute similarity between 2 documents.
You can see more details in Jaccard index or MinHash.
My similarity computation strategy is below:
- Calculate hash value for each line for each files.
- Create set which contains hashes calculated in the above process for the each files.
- Compute Jaccard Similarity Coefficient for all combinations of the files.
Source Code
Now I can show you code snippets for compute source code similarity based on Jaccard Similarity Coefficient.Jaccard Simlarity
package com.dukesoftware.utils.text;
import java.util.HashSet;
import java.util.Set;
/**
* Jaccard similarity coefficient http://en.wikipedia.org/wiki/MinHash
*/
public class JaccardSimlarity<T> {
private final Set<T> intersect = new HashSet<>();
private final Set<T> union = new HashSet<>();
public double compute(Set<T> set1, Set<T> set2)
{
intersect.clear();
intersect.addAll(set1);
intersect.retainAll(set2);
union.clear();
union.addAll(set1);
union.addAll(set2);
return (double)intersect.size()/(double)union.size();
}
}
Compute Hashes For Source Files And Compute Similarity Base On Jaccard Index
package com.dukesoftware.utils.text;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import static com.dukesoftware.utils.text.Utils.*;
public class SimilarityByJaccardIndexBasedOnSimpleHash {
public static void main(String[] args) throws IOException {
File directory = userDirectory("Your target directory here");
List<File> files = readJavaFiles(directory);
System.out.println(files.size() + " files");
long start = System.currentTimeMillis();
List<SourceFile> sourceFiles = new ArrayList<>();
for(File file : files)
{
sourceFiles.add(new SourceFile(calcHashForEachLine(file)));
}
System.out.println((System.currentTimeMillis() - start) / (double)files.size()+" sec/file");
double[][] similarity = null;
start = System.currentTimeMillis();
similarity = computeDistanceBasedOnJaccardSimilarity(sourceFiles);
System.out.println((System.currentTimeMillis() - start) / (double)files.size()+" sec");
writeAsCsvFile(similarity, "c:/temp/similarity.txt");
}
private static double[][] computeDistanceBasedOnJaccardSimilarity(List<SourceFile> files) throws IOException
{
SourceFile[] fileArray = files.toArray(new SourceFile[0]);
final int size = fileArray.length;
JaccardSimlarity<Long> jaccardSimlarity = new JaccardSimlarity<>();
double[][] similarity = new double[size][size];
for(int i = 0; i < size; i++)
{
for(int j = i; j < size; j++)
{
double dist = 1-jaccardSimlarity.compute(fileArray[i].set, fileArray[j].set);
similarity[i][j] = similarity[j][i] = dist;
}
}
return similarity;
}
private static class SourceFile{
private final Set<Long> set;
public SourceFile(Set<Long> set) {
super();
this.set = set;
}
}
}
Trivial Utility Methods
package com.dukesoftware.utils.text;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Consumer;
import java.util.regex.Pattern;
public class Utils {
public static Set<Long> calcHashForEachLine(File file) throws IOException
{
Set<Long> set = new HashSet<>();
processLine(file, (line)->{
line = replaceMultipleSpaceAndTabToSingleSpace(line);
if(line.isEmpty()) return;
set.add(hash(line, 0));
});
return set;
}
public static void processLine(File file, Consumer<String> lineProcessor) throws IOException{
try(FileReader in = new FileReader(file);
BufferedReader br = new BufferedReader(in)){
String line;
while ((line = br.readLine()) != null) {
lineProcessor.accept(line);
}
}
}
// http://stackoverflow.com/questions/98153/whats-the-best-hashing-algorithm-to-use-on-a-stl-string-when-using-hash-map
private static long hash(String s, long seed)
{
long hash = seed;
char[] cs = s.toCharArray();
for(char c : cs)
{
hash = hash * 101 + c;
}
return hash;
}
public static File userDirectory(String dir){
return new File(System.getProperty("user.home"), dir);
}
public static void writeAsCsvFile(double[][] matrix, String path) throws IOException {
try(BufferedWriter bw = new BufferedWriter(new FileWriter(new File(path))))
{
for(int i = 0; i < matrix.length; i++)
{
bw.write(appendAsString(matrix[i], ","));
bw.newLine();
}
}
}
private final static Pattern PATTERN = Pattern.compile("[\\s]+");
private static String replaceMultipleSpaceAndTabToSingleSpace(String src)
{
return PATTERN.matcher(src).replaceAll(" ").trim();
}
private static <T> String appendAsString(double[] array, String delimiter){
if(array.length == 0){
return "";
}
StringBuilder sb = new StringBuilder();
for(double t : array){
sb.append(delimiter).append(t);
}
return sb.substring(delimiter.length(), sb.length());
}
public static List<File> readJavaFiles(File dir) throws IOException {
List<File> files = new ArrayList<>();
visit(dir, file -> {
if(file.getName().endsWith("java"))
{
files.add(file);
}
});
return files;
}
private static void visit(File file, Consumer<File> processor) {
if (file.isDirectory()) {
final File[] files = file.listFiles();
for (File child : files) {
visit(child, processor);
}
} else {
processor.accept(file);
}
}
}
コメント