Core Java
Generic Text Comparison Tool with LCS Approach
Detecting and showing differences of two texts (especially having hundreds or thousands of lines) is a common problem. Using pure java.lang.String class methods may be a solution, but the most important issue for that kind of operations, “performance” will not be satisfactory. We want an efficient solution which may have a view as below:
Text Difference Tool Example |
The problem contains two parts:
- Detecting differences of two texts: For detecting differences, an efficient dynamic algorithm of LCS (Longest Common Subsequence) used in this solution. This solution has O(text1WordCount * text2WordCount) complexity and coded as “longestCommonSubsequence” method below.
- Visualizing the differences: For visualizing, an HTML tag based approach is used, which marks new words of text2 with green color and old words of text1 with red color. This solution has O(changedWordsCount * (text1WordCount+text2WordCount)) complexity and coded as “markTextDifferences” method below.
Note1: For simplicity, “normalizeText” method is used for removing \n, \t and multiple space characters. Note2: This class was created as a Vaadin component. But “longestCommonSubsequence” is pure generic and “markTextDifferences” method is generic on HTML based visual components, so they can also be used with different frameworks.
001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | import java.util.ArrayList; import com.vaadin.ui.CustomComponent; import com.vaadin.ui.Label; import com.vaadin.ui.Layout; import com.vaadin.ui.VerticalLayout; /** * Text comparison component which marks differences of two texts with colors. * * @author cb */ public class TextCompareComponent extends CustomComponent { private Layout mainLayout = new VerticalLayout(); private ArrayList<String> longestCommonSubsequenceList; private static final String INSERT_COLOR = "#99FFCC" ; private static final String DELETE_COLOR = "#CB6D6D" ; public TextCompareComponent(String text1, String text2) { text1 = normalizeText(text1); text2 = normalizeText(text2); this .longestCommonSubsequenceList = longestCommonSubsequence(text1, text2); String result = markTextDifferences(text1, text2, longestCommonSubsequenceList, INSERT_COLOR, DELETE_COLOR); Label label = new Label(result, Label.CONTENT_XHTML); mainLayout.addComponent(label); setCompositionRoot(mainLayout); } /** * Finds a list of longest common subsequences (lcs) of given two texts. * * @param text1 * @param text2 * @return - longest common subsequence list */ private ArrayList<String> longestCommonSubsequence(String text1,String text2){ String[] text1Words = text1.split( " " ); String[] text2Words = text2.split( " " ); int text1WordCount = text1Words.length; int text2WordCount = text2Words.length; int [][] solutionMatrix = new int [text1WordCount + 1 ][text2WordCount + 1 ]; for ( int i = text1WordCount - 1 ; i >= 0 ; i--) { for ( int j = text2WordCount - 1 ; j >= 0 ; j--) { if (text1Words[i].equals(text2Words[j])){ solutionMatrix[i][j] = solutionMatrix[i + 1 ][j + 1 ] + 1 ; } else { solutionMatrix[i][j] = Math.max(solutionMatrix[i + 1 ][j], solutionMatrix[i][j + 1 ]); } } } int i = 0 , j = 0 ; ArrayList<String> lcsResultList = new ArrayList<String>(); while (i < text1WordCount && j < text2WordCount) { if (text1Words[i].equals(text2Words[j])) { lcsResultList.add(text2Words[j]); i++; j++; } else if (solutionMatrix[i + 1 ][j] >= solutionMatrix[i][j + 1 ]) { i++; } else { j++; } } return lcsResultList; } /** * Normalizes given string by deleting \n, \t and extra spaces. * * @param text - initial string * @return - normalized string */ private String normalizeText(String text) { text = text.trim(); text = text.replace( "\n" , " " ); text = text.replace( "\t" , " " ); while (text.contains( " " )) { text = text.replace( " " , " " ); } return text; } /** * Returns colored inserted/deleted text representation of given two texts. * Uses longestCommonSubsequenceList to determine colored sections. * * @param text1 * @param text2 * @param lcsList * @param insertColor * @param deleteColor * @return - colored result text */ private String markTextDifferences(String text1, String text2, ArrayList<String> lcsList, String insertColor, String deleteColor) { StringBuffer stringBuffer = new StringBuffer(); if (text1 != null && lcsList != null ) { String[] text1Words = text1.split( " " ); String[] text2Words = text2.split( " " ); int i = 0 , j = 0 , word1LastIndex = 0 , word2LastIndex = 0 ; for ( int k = 0 ; k < lcsList.size(); k++) { for (i = word1LastIndex, j = word2LastIndex; i < text1Words.length && j < text2Words.length;) { if (text1Words[i].equals(lcsList.get(k)) && text2Words[j].equals(lcsList.get(k))) { stringBuffer.append( "<SPAN>" + lcsList.get(k) + " </SPAN>" ); word1LastIndex = i + 1 ; word2LastIndex = j + 1 ; i = text1Words.length; j = text2Words.length; } else if (!text1Words[i].equals(lcsList.get(k))) { for (; i < text1Words.length && !text1Words[i].equals(lcsList.get(k)); i++) { stringBuffer.append( "<SPAN style='BACKGROUND-COLOR:" + deleteColor + "'>" + text1Words[i] + " </SPAN>" ); } } else if (!text2Words[j].equals(lcsList.get(k))) { for (; j < text2Words.length && !text2Words[j].equals(lcsList.get(k)), j++) { stringBuffer.append( "<SPAN style='BACKGROUND-COLOR:" + insertColor + "'>" + text2Words[j] + " </SPAN>" ); } } } } for (; word1LastIndex < text1Words.length; word1LastIndex++) { stringBuffer.append( "<SPAN style='BACKGROUND-COLOR:" + deleteColor + "'>" + text1Words[word1LastIndex] + " </SPAN>" ); } for (; word2LastIndex < text2Words.length; word2LastIndex++) { stringBuffer.append( "<SPAN style='BACKGROUND-COLOR:" + insertColor + "'>" + text2Words[word2LastIndex] + " </SPAN>" ); } } return stringBuffer.toString(); } } |
Reference: A Generic Text Comparison Tool Implementation with LCS Approach from our JCG partner Cagdas Basaraner at the CodeBuild blog.