Enterprise Java
Blog Categorisation using Encog, ROME, JSoup and Google Guava
Continuing with Programming Collective Intelligence (PCI), the next exercise was to use distance scores to pigeonhole a list of blogs based on the words used within each blog.
I had already settled on Encog as the framework for the AI / machine learning algorithms; for this exercise I also needed an RSS reader and an HTML parser.
The two libraries I ended up using were:
- ROME for reading the RSS / Atom feeds
- JSoup for parsing the HTML content
For other general utilities and collection manipulation I used:
- Google Guava
I kept the list of blogs short and included some of the software bloggers I follow, just to make testing quick. I had to alter the percentage thresholds a little from the implementation in PCI (sketched after the list below), but still got the desired result.
Blogs Used:
- http://blog.guykawasaki.com/index.rdf
- http://blog.outer-court.com/rss.xml
- http://flagrantdisregard.com/index.php/feed/
- http://gizmodo.com/index.xml
- http://googleblog.blogspot.com/rss.xml
- http://radar.oreilly.com/index.rdf
- http://www.wired.com/rss/index.xml
- http://feeds.feedburner.com/codinghorror
- http://feeds.feedburner.com/joelonsoftware
- http://martinfowler.com/feed.atom
- http://www.briandupreez.net/feeds/posts/default
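
The percentage thresholds are the interesting part: following the PCI approach, words that are either too rare or too common across the collected feeds are dropped before clustering, as are very short words. A minimal sketch of that criterion, mirroring the cut-offs used in stripOutlyingWords further down (the WordFilter class and keepWord helper are just for illustration):

// Hypothetical helper illustrating the outlier-word filter applied before clustering.
public final class WordFilter {

    /**
     * Keep a word only if it is neither too rare (< 0.1%) nor too common (> 20%)
     * relative to the size of the full vocabulary, and is at least 3 characters long.
     */
    public static boolean keepWord(final String word, final double totalCount, final double vocabularySize) {
        final double percentage = (totalCount / vocabularySize) * 100;
        return percentage >= 0.1 && percentage <= 20 && word.length() >= 3;
    }

    public static void main(final String[] args) {
        // 5 occurrences against a vocabulary of 10 000 unique words is only 0.05% - too rare
        System.out.println(keepWord("encog", 5, 10_000));   // false
        // 250 occurrences is 2.5%, inside the 0.1% - 20% band, and the word is long enough
        System.out.println(keepWord("java", 250, 10_000));  // true
    }
}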
For the implementation I just went with a main class and a reader class:
package net.briandupreez.pci.data;

import com.google.common.base.Predicates;
import com.google.common.collect.Collections2;
import com.sun.syndication.feed.synd.SyndCategoryImpl;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEntryImpl;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URL;
import java.util.*;

public class FeedReader {

    /**
     * Reads the feed at the given URL and adds every unique word found in its
     * entries (title, categories, description and content) to the given set.
     */
    @SuppressWarnings("unchecked")
    public static Set<String> determineAllUniqueWords(final String url, final Set<String> blogWordList) {
        try {
            final URL feedUrl = new URL(url);
            final SyndFeedInput input = new SyndFeedInput();
            final SyndFeed feed = input.build(new XmlReader(feedUrl));

            final List<SyndEntryImpl> entries = feed.getEntries();
            for (final SyndEntryImpl entry : entries) {
                blogWordList.addAll(cleanAndSplitString(entry.getTitle()));
                blogWordList.addAll(doCategories(entry));
                blogWordList.addAll(doDescription(entry));
                blogWordList.addAll(doContent(entry));
            }
        } catch (final Exception ex) {
            ex.printStackTrace();
            System.out.println("ERROR: " + url + "\n" + ex.getMessage());
        }
        return blogWordList;
    }

    @SuppressWarnings("unchecked")
    private static List<String> doContent(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final List<SyndContent> contents = entry.getContents();
        if (contents != null) {
            for (final SyndContent syndContent : contents) {
                // strip markup from HTML content, otherwise just split the raw text
                if ("text/html".equals(syndContent.getType())) {
                    blogWordList.addAll(stripHtmlAndAddText(syndContent));
                } else {
                    blogWordList.addAll(cleanAndSplitString(syndContent.getValue()));
                }
            }
        }
        return blogWordList;
    }

    private static List<String> doDescription(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final SyndContent description = entry.getDescription();
        if (description != null) {
            if ("text/html".equals(description.getType())) {
                blogWordList.addAll(stripHtmlAndAddText(description));
            } else {
                blogWordList.addAll(cleanAndSplitString(description.getValue()));
            }
        }
        return blogWordList;
    }

    @SuppressWarnings("unchecked")
    private static List<String> doCategories(final SyndEntryImpl entry) {
        final List<String> blogWordList = new ArrayList<>();
        final List<SyndCategoryImpl> categories = entry.getCategories();
        for (final SyndCategoryImpl category : categories) {
            blogWordList.add(category.getName().toLowerCase());
        }
        return blogWordList;
    }

    private static List<String> stripHtmlAndAddText(final SyndContent description) {
        // use JSoup to drop the markup and keep only the element text
        final String html = description.getValue();
        final Document document = Jsoup.parse(html);
        final Elements elements = document.getAllElements();
        final List<String> allWords = new ArrayList<>();
        for (final Element element : elements) {
            allWords.addAll(cleanAndSplitString(element.text()));
        }
        return allWords;
    }

    private static List<String> cleanAndSplitString(final String input) {
        if (input != null) {
            // lower case, strip punctuation and digits, then split on whitespace
            final String[] dic = input.toLowerCase()
                    .replaceAll("\\p{Punct}", "")
                    .replaceAll("\\p{Digit}", "")
                    .split("\\s+");
            return Arrays.asList(dic);
        }
        return new ArrayList<>();
    }

    /**
     * Reads the feed at the given URL and counts how many times each of the
     * given words occurs across all of its entries.
     */
    @SuppressWarnings("unchecked")
    public static Map<String, Double> countWords(final String url, final Set<String> blogWords) {
        final Map<String, Double> resultMap = new TreeMap<>();
        try {
            final URL feedUrl = new URL(url);
            final SyndFeedInput input = new SyndFeedInput();
            final SyndFeed feed = input.build(new XmlReader(feedUrl));

            final List<SyndEntryImpl> entries = feed.getEntries();
            final List<String> allBlogWords = new ArrayList<>();
            for (final SyndEntryImpl entry : entries) {
                allBlogWords.addAll(cleanAndSplitString(entry.getTitle()));
                allBlogWords.addAll(doCategories(entry));
                allBlogWords.addAll(doDescription(entry));
                allBlogWords.addAll(doContent(entry));
            }
            for (final String word : blogWords) {
                resultMap.put(word, (double) Collections2.filter(allBlogWords, Predicates.equalTo(word)).size());
            }
        } catch (final Exception ex) {
            ex.printStackTrace();
            System.out.println("ERROR: " + url + "\n" + ex.getMessage());
        }
        return resultMap;
    }
}
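
To sanity-check the reader on its own before wiring it into the clustering, something like the following works; a minimal sketch, assuming a single hard-coded feed URL (the FeedReaderSmokeTest class is just for illustration):

package net.briandupreez.pci.data;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Hypothetical standalone check of FeedReader against a single feed.
public class FeedReaderSmokeTest {
    public static void main(final String[] args) {
        final String url = "http://feeds.feedburner.com/codinghorror";
        // collect every unique word found in the feed's titles, categories, descriptions and content
        final Set<String> words = FeedReader.determineAllUniqueWords(url, new HashSet<String>());
        // count how often each of those words occurs in the same feed
        final Map<String, Double> counts = FeedReader.countWords(url, words);
        System.out.println("Unique words: " + words.size() + ", counted: " + counts.size());
    }
}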
Main:
package net.briandupreez.pci.data;

import com.google.common.base.Predicates;
import com.google.common.collect.Maps;
import com.google.common.io.Resources;
import com.google.common.primitives.Doubles;
import org.encog.ml.MLCluster;
import org.encog.ml.data.MLDataPair;
import org.encog.ml.data.MLDataSet;
import org.encog.ml.data.basic.BasicMLData;
import org.encog.ml.data.basic.BasicMLDataPair;
import org.encog.ml.data.basic.BasicMLDataSet;
import org.encog.ml.kmeans.KMeansClustering;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;

public class FeedReaderMain {

    public static void main(final String[] args) {
        final FeedReaderMain feedReaderMain = new FeedReaderMain();
        try {
            feedReaderMain.run();
        } catch (final IOException e) {
            e.printStackTrace();
        }
    }

    public void run() throws IOException {
        final String file = Resources.getResource("short-feedlist.txt").getFile();
        final Set<String> blogWords = determineWordCompleteList(file);
        final Map<String, Map<String, Double>> blogWordCount = countWordsPerBlog(file, blogWords);

        // strip out the outlying words
        stripOutlyingWords(blogWords, blogWordCount);

        performClusteringAndDisplay(blogWordCount);
    }

    private void performClusteringAndDisplay(final Map<String, Map<String, Double>> blogWordCount) {
        // build one input vector per blog from its word counts
        final BasicMLDataSet set = new BasicMLDataSet();
        final Map<String, List<Double>> inputMap = new HashMap<>();
        for (final Map.Entry<String, Map<String, Double>> entry : blogWordCount.entrySet()) {
            final Map<String, Double> mainValues = entry.getValue();
            final double[] elements = Doubles.toArray(mainValues.values());
            final List<Double> listInput = Doubles.asList(elements);
            inputMap.put(entry.getKey(), listInput);
            set.add(new BasicMLData(elements));
        }

        final KMeansClustering kmeans = new KMeansClustering(3, set);
        kmeans.iteration(150);

        // Display the clusters
        int i = 1;
        for (final MLCluster cluster : kmeans.getClusters()) {
            System.out.println("*** Cluster " + (i++) + " ***");
            final MLDataSet ds = cluster.createDataSet();
            final MLDataPair pair = BasicMLDataPair.createPair(ds.getInputSize(), ds.getIdealSize());
            for (int j = 0; j < ds.getRecordCount(); j++) {
                ds.getRecord(j, pair);
                final List<Double> listInput = Doubles.asList(pair.getInputArray());
                // look up which blog produced this input vector
                System.out.println(Maps.filterValues(inputMap, Predicates.equalTo(listInput)).keySet().toString());
            }
        }
    }

    private Map<String, Map<String, Double>> countWordsPerBlog(final String file, final Set<String> blogWords) throws IOException {
        final BufferedReader reader = new BufferedReader(new FileReader(file));
        final Map<String, Map<String, Double>> blogWordCount = new HashMap<>();
        String line;
        while ((line = reader.readLine()) != null) {
            final Map<String, Double> wordCounts = FeedReader.countWords(line, blogWords);
            blogWordCount.put(line, wordCounts);
        }
        return blogWordCount;
    }

    private Set<String> determineWordCompleteList(final String file) throws IOException {
        final FileReader fileReader = new FileReader(file);
        final BufferedReader reader = new BufferedReader(fileReader);
        String line;
        Set<String> blogWords = new HashSet<>();
        while ((line = reader.readLine()) != null) {
            blogWords = FeedReader.determineAllUniqueWords(line, blogWords);
            System.out.println("Size: " + blogWords.size());
        }
        return blogWords;
    }

    private void stripOutlyingWords(final Set<String> blogWords, final Map<String, Map<String, Double>> blogWordCount) {
        final Iterator<String> wordIter = blogWords.iterator();
        final double listSize = blogWords.size();
        while (wordIter.hasNext()) {
            final String word = wordIter.next();
            double wordCount = 0;
            for (final Map<String, Double> values : blogWordCount.values()) {
                wordCount += values.get(word) != null ? values.get(word) : 0;
            }
            final double percentage = (wordCount / listSize) * 100;
            // drop very rare words, very common words and very short words
            if (percentage < 0.1 || percentage > 20 || word.length() < 3) {
                wordIter.remove();
                for (final Map<String, Double> values : blogWordCount.values()) {
                    values.remove(word);
                }
            } else {
                System.out.println("\t keeping: " + word + " Percentage:" + percentage);
            }
        }
    }
}
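
The Encog part is easier to see in isolation from the feed reading. Below is a minimal sketch that clusters a few toy two-dimensional vectors using the same KMeansClustering calls as above; the cluster count, iteration count and sample points are arbitrary and just for illustration:

import java.util.Arrays;

import org.encog.ml.MLCluster;
import org.encog.ml.data.MLDataPair;
import org.encog.ml.data.MLDataSet;
import org.encog.ml.data.basic.BasicMLData;
import org.encog.ml.data.basic.BasicMLDataPair;
import org.encog.ml.data.basic.BasicMLDataSet;
import org.encog.ml.kmeans.KMeansClustering;

// Toy example: cluster four 2D points into two groups with Encog's k-means.
public class KMeansToyExample {
    public static void main(final String[] args) {
        final BasicMLDataSet set = new BasicMLDataSet();
        set.add(new BasicMLData(new double[]{1.0, 1.0}));
        set.add(new BasicMLData(new double[]{1.5, 2.0}));
        set.add(new BasicMLData(new double[]{8.0, 8.0}));
        set.add(new BasicMLData(new double[]{9.0, 9.5}));

        final KMeansClustering kmeans = new KMeansClustering(2, set);
        kmeans.iteration(25);

        int i = 1;
        for (final MLCluster cluster : kmeans.getClusters()) {
            System.out.println("*** Cluster " + (i++) + " ***");
            final MLDataSet ds = cluster.createDataSet();
            final MLDataPair pair = BasicMLDataPair.createPair(ds.getInputSize(), ds.getIdealSize());
            for (int j = 0; j < ds.getRecordCount(); j++) {
                ds.getRecord(j, pair);
                System.out.println(Arrays.toString(pair.getInputArray()));
            }
        }
    }
}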
The Results:
*** Cluster 1 ***
[http://www.briandupreez.net/feeds/posts/default]
*** Cluster 2 ***
[http://blog.guykawasaki.com/index.rdf]
[http://radar.oreilly.com/index.rdf]
[http://googleblog.blogspot.com/rss.xml]
[http://blog.outer-court.com/rss.xml]
[http://gizmodo.com/index.xml]
[http://flagrantdisregard.com/index.php/feed/]
[http://www.wired.com/rss/index.xml]
*** Cluster 3 ***
[http://feeds.feedburner.com/joelonsoftware]
[http://feeds.feedburner.com/codinghorror]
[http://martinfowler.com/feed.atom]
Reference: Blog Categorisation using Encog, ROME, JSoup and Google Guava from our JCG partner Brian Du Preez at the Zen in the art of IT blog.