Compute Source Code Similarity
I have tried to analyze source code by various methods recently.
In this post, I will show you my artifact - Compute source code similarity based on Jaccard Similarity Coefficient.
Jaccard Similarity Coefficient is an common and quick techniqueto compute similarity between 2 documents.
You can see more details in Jaccard index or MinHash.
My similarity computation strategy is below:
- Calculate hash value for each line for each files.
- Create set which contains hashes calculated in the above process for the each files.
- Compute Jaccard Similarity Coefficient for all combinations of the files.
Source Code
Now I can show you code snippets for compute source code similarity based on Jaccard Similarity Coefficient.Jaccard Simlarity
package com.dukesoftware.utils.text; import java.util.HashSet; import java.util.Set; /** * Jaccard similarity coefficient http://en.wikipedia.org/wiki/MinHash */ public class JaccardSimlarity<T> { private final Set<T> intersect = new HashSet<>(); private final Set<T> union = new HashSet<>(); public double compute(Set<T> set1, Set<T> set2) { intersect.clear(); intersect.addAll(set1); intersect.retainAll(set2); union.clear(); union.addAll(set1); union.addAll(set2); return (double)intersect.size()/(double)union.size(); } }
Compute Hashes For Source Files And Compute Similarity Base On Jaccard Index
package com.dukesoftware.utils.text; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Set; import static com.dukesoftware.utils.text.Utils.*; public class SimilarityByJaccardIndexBasedOnSimpleHash { public static void main(String[] args) throws IOException { File directory = userDirectory("Your target directory here"); List<File> files = readJavaFiles(directory); System.out.println(files.size() + " files"); long start = System.currentTimeMillis(); List<SourceFile> sourceFiles = new ArrayList<>(); for(File file : files) { sourceFiles.add(new SourceFile(calcHashForEachLine(file))); } System.out.println((System.currentTimeMillis() - start) / (double)files.size()+" sec/file"); double[][] similarity = null; start = System.currentTimeMillis(); similarity = computeDistanceBasedOnJaccardSimilarity(sourceFiles); System.out.println((System.currentTimeMillis() - start) / (double)files.size()+" sec"); writeAsCsvFile(similarity, "c:/temp/similarity.txt"); } private static double[][] computeDistanceBasedOnJaccardSimilarity(List<SourceFile> files) throws IOException { SourceFile[] fileArray = files.toArray(new SourceFile[0]); final int size = fileArray.length; JaccardSimlarity<Long> jaccardSimlarity = new JaccardSimlarity<>(); double[][] similarity = new double[size][size]; for(int i = 0; i < size; i++) { for(int j = i; j < size; j++) { double dist = 1-jaccardSimlarity.compute(fileArray[i].set, fileArray[j].set); similarity[i][j] = similarity[j][i] = dist; } } return similarity; } private static class SourceFile{ private final Set<Long> set; public SourceFile(Set<Long> set) { super(); this.set = set; } } }
Trivial Utility Methods
package com.dukesoftware.utils.text; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.function.Consumer; import java.util.regex.Pattern; public class Utils { public static Set<Long> calcHashForEachLine(File file) throws IOException { Set<Long> set = new HashSet<>(); processLine(file, (line)->{ line = replaceMultipleSpaceAndTabToSingleSpace(line); if(line.isEmpty()) return; set.add(hash(line, 0)); }); return set; } public static void processLine(File file, Consumer<String> lineProcessor) throws IOException{ try(FileReader in = new FileReader(file); BufferedReader br = new BufferedReader(in)){ String line; while ((line = br.readLine()) != null) { lineProcessor.accept(line); } } } // http://stackoverflow.com/questions/98153/whats-the-best-hashing-algorithm-to-use-on-a-stl-string-when-using-hash-map private static long hash(String s, long seed) { long hash = seed; char[] cs = s.toCharArray(); for(char c : cs) { hash = hash * 101 + c; } return hash; } public static File userDirectory(String dir){ return new File(System.getProperty("user.home"), dir); } public static void writeAsCsvFile(double[][] matrix, String path) throws IOException { try(BufferedWriter bw = new BufferedWriter(new FileWriter(new File(path)))) { for(int i = 0; i < matrix.length; i++) { bw.write(appendAsString(matrix[i], ",")); bw.newLine(); } } } private final static Pattern PATTERN = Pattern.compile("[\\s]+"); private static String replaceMultipleSpaceAndTabToSingleSpace(String src) { return PATTERN.matcher(src).replaceAll(" ").trim(); } private static <T> String appendAsString(double[] array, String delimiter){ if(array.length == 0){ return ""; } StringBuilder sb = new StringBuilder(); for(double t : array){ sb.append(delimiter).append(t); } return sb.substring(delimiter.length(), sb.length()); } public static List<File> readJavaFiles(File dir) throws IOException { List<File> files = new ArrayList<>(); visit(dir, file -> { if(file.getName().endsWith("java")) { files.add(file); } }); return files; } private static void visit(File file, Consumer<File> processor) { if (file.isDirectory()) { final File[] files = file.listFiles(); for (File child : files) { visit(child, processor); } } else { processor.accept(file); } } }
コメント