Compute Source Code Similarity
I have tried to analyze source code by various methods recently.
In this post, I will show you my artifact - Compute source code similarity based on Jaccard Similarity Coefficient.
Jaccard Similarity Coefficient is an common and quick techniqueto compute similarity between 2 documents.
You can see more details in Jaccard index or MinHash.
My similarity computation strategy is below:
- Calculate hash value for each line for each files.
- Create set which contains hashes calculated in the above process for the each files.
- Compute Jaccard Similarity Coefficient for all combinations of the files.
Source Code
Now I can show you code snippets for compute source code similarity based on Jaccard Similarity Coefficient.Jaccard Simlarity
package com.dukesoftware.utils.text;
import java.util.HashSet;
import java.util.Set;
/**
 * Jaccard similarity coefficient http://en.wikipedia.org/wiki/MinHash
 */
public class JaccardSimlarity<T> {
    private final Set<T> intersect = new HashSet<>();
    private final Set<T> union = new HashSet<>();
    
    public double compute(Set<T> set1, Set<T> set2)
    {
        intersect.clear();
        intersect.addAll(set1);
        intersect.retainAll(set2);
        union.clear();
        union.addAll(set1);
        union.addAll(set2);
        return (double)intersect.size()/(double)union.size();
    }
}
Compute Hashes For Source Files And Compute Similarity Base On Jaccard Index
package com.dukesoftware.utils.text;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import static com.dukesoftware.utils.text.Utils.*;
public class SimilarityByJaccardIndexBasedOnSimpleHash {
    public static void main(String[] args) throws IOException {
        File directory = userDirectory("Your target directory here");
        List<File> files = readJavaFiles(directory);
        System.out.println(files.size() + " files");
        long start = System.currentTimeMillis();
        List<SourceFile> sourceFiles = new ArrayList<>();
        for(File file : files)
        {
            sourceFiles.add(new SourceFile(calcHashForEachLine(file)));
        }
        System.out.println((System.currentTimeMillis() - start) / (double)files.size()+" sec/file");
        double[][] similarity = null;
        
        start = System.currentTimeMillis();
        similarity = computeDistanceBasedOnJaccardSimilarity(sourceFiles);
        System.out.println((System.currentTimeMillis() - start) / (double)files.size()+" sec");
        
        writeAsCsvFile(similarity, "c:/temp/similarity.txt");
    }
    private static double[][] computeDistanceBasedOnJaccardSimilarity(List<SourceFile> files) throws IOException
    {
        SourceFile[] fileArray = files.toArray(new SourceFile[0]);
        final int size = fileArray.length;
        
        JaccardSimlarity<Long> jaccardSimlarity = new JaccardSimlarity<>();
        double[][] similarity = new double[size][size];
        for(int i = 0; i < size; i++)
        {
            for(int j = i; j < size; j++)
            {
                double dist = 1-jaccardSimlarity.compute(fileArray[i].set, fileArray[j].set);
                similarity[i][j] = similarity[j][i] = dist;
            }
        }
        return similarity;
    }
    
    private static class SourceFile{
        private final Set<Long> set;
        public SourceFile(Set<Long> set) {
            super();
            this.set = set;
        }
    }
}
Trivial Utility Methods
package com.dukesoftware.utils.text;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Consumer;
import java.util.regex.Pattern;
public class Utils {
    
    public static Set<Long> calcHashForEachLine(File file) throws IOException
    {
        Set<Long> set = new HashSet<>();
        processLine(file, (line)->{
            line = replaceMultipleSpaceAndTabToSingleSpace(line);
            if(line.isEmpty()) return;
            set.add(hash(line, 0));
        });
        return set;
    }
    
    public static void processLine(File file, Consumer<String> lineProcessor) throws IOException{
        try(FileReader in = new FileReader(file);
      BufferedReader br = new BufferedReader(in)){
            String line;
            while ((line = br.readLine()) != null) {
                lineProcessor.accept(line);
            }
        }
    }
        
    // http://stackoverflow.com/questions/98153/whats-the-best-hashing-algorithm-to-use-on-a-stl-string-when-using-hash-map
    private static long hash(String s, long seed)
    {
        long hash = seed;
        char[] cs = s.toCharArray();
        for(char c : cs)
        {
            hash = hash * 101  +  c;
        }
        return hash;
    }
    
    public static File userDirectory(String dir){
        return new File(System.getProperty("user.home"), dir);
    }
    
    public static void writeAsCsvFile(double[][] matrix, String path) throws IOException {
        try(BufferedWriter bw = new BufferedWriter(new FileWriter(new File(path))))
        {
            for(int i = 0; i < matrix.length; i++)
            {
                bw.write(appendAsString(matrix[i], ","));
                bw.newLine();
            }
        }
    }
    
    private final static Pattern PATTERN = Pattern.compile("[\\s]+");
    private static String replaceMultipleSpaceAndTabToSingleSpace(String src)
    {
        return PATTERN.matcher(src).replaceAll(" ").trim();
    }
    
    private static <T> String appendAsString(double[] array, String delimiter){
        if(array.length == 0){
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for(double t : array){
            sb.append(delimiter).append(t);
        }
        return sb.substring(delimiter.length(), sb.length());
    }
    
    public static List<File> readJavaFiles(File dir) throws IOException {
        List<File> files = new ArrayList<>();
        visit(dir, file -> {
            if(file.getName().endsWith("java"))
            {
                files.add(file);
            }
        });
        return files;
    }
    
    private static void visit(File file, Consumer<File> processor) {
        if (file.isDirectory()) {
            final File[] files = file.listFiles();
            for (File child : files) {
                visit(child, processor);
            }
        } else {
            processor.accept(file);
        }
    }
}
コメント