Updated FormatUtility

- Added convenience method "tokenizeText" which uses Lucene's StandardTokenizer to tokenize an arbitrary input string. - Added convenience method "getHtmlText" which uses Jsoup to extract the text from HTML source.
Nuix · Aug 29, 2019 · 0985562 · 0985562
1 parent 83f9345
commit 0985562
Show file tree

Hide file tree

Showing 2 changed files with 143 additions and 0 deletions.
diff --git a/Java/src/main/java/com/nuix/superutilities/misc/FormatUtility.java b/Java/src/main/java/com/nuix/superutilities/misc/FormatUtility.java
@@ -1,6 +1,7 @@
 package com.nuix.superutilities.misc;
 
 import java.io.File;
+import java.io.StringReader;
 import java.math.BigDecimal;
 import java.text.NumberFormat;
 import java.util.ArrayList;
@@ -15,7 +16,11 @@
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.exception.ExceptionUtils;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.joda.time.DateTime;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
 
 import nuix.Item;
 
@@ -312,6 +317,42 @@ public static double round(double value, int numberOfDigitsAfterDecimalPoint) {
         return bigDecimal.doubleValue();
     }
 
+	/***
+	 * Convenience method for using Lucene's StandardTokenizer to tokenize arbitrary text.
+	 * @param inputText The text to tokenize
+	 * @return A list of tokens parsed from inputText by Lucene.
+	 * @throws Exception Most likely thrown if Lucene tokenizer encounters a problem.
+	 */
+	public static List<String> tokenizeText(String inputText) throws Exception{
+		List<String> tokens = new ArrayList<String>();
+
+		if(inputText != null) {
+			try(StringReader stringReader = new StringReader(inputText)){
+				try(StandardTokenizer tokenizer = new StandardTokenizer()){
+					tokenizer.setReader(stringReader);
+					CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
+					tokenizer.reset();
+					while(tokenizer.incrementToken()) {
+						tokens.add(attribute.toString());
+					}
+					tokenizer.end();
+				}
+			}
+		}
+		return tokens;
+	}
+
+	/***
+	 * Convenience method for getting the "rendered text" (text as seen in browser) of HTML source code using Jsoup.
+	 * @param htmlSource The HTML source code to parse the text from; null is treated as having no text.
+	 * @return The text of the parsed document's body, or an empty string when htmlSource is null.
+	 */
+	public static String getHtmlText(String htmlSource) {
+		if(htmlSource == null) { return ""; } // Jsoup.parse does not accept a null string
+		Document doc = Jsoup.parse(htmlSource);
+		return doc.body().text();
+	}
+
 	public static String formatAsTextualTable(List<List<String>> rows) {
 		List<Integer> columnWidths = new ArrayList<Integer>();
 		for(List<String> row : rows) {

diff --git a/RubyTests/Test_FormatUtility.rb b/RubyTests/Test_FormatUtility.rb
@@ -0,0 +1,102 @@
+script_directory = File.dirname(__FILE__)
+require File.join(script_directory,"SuperUtilities.jar")
+java_import com.nuix.superutilities.SuperUtilities
+$su = SuperUtilities.init($utilities,NUIX_VERSION)
+java_import com.nuix.superutilities.misc.FormatUtility
+
+puts "===== HEX => BYTES / BYTES => HEX ====="
+md5_string = "8b69e50c471a3b73b021d2182c372eca"
+md5_bytes = FormatUtility.hexToBytes(md5_string)
+converted_md5_string = FormatUtility.bytesToHex(md5_bytes)
+
+puts "md5_string: #{md5_string}"
+puts "converted_md5_string: #{converted_md5_string}"
+
+puts "===== ELAPSED STRING ====="
+def to_seconds(days=0,hours=0,minutes=0,seconds=0)
+	return seconds + (minutes * 60) + (hours * 60 * 60) + (days * 24 * 60 * 60)
+end
+
+5.times do
+	days = rand(0..3)
+	hours = rand(0..23)
+	minutes = rand(0..59)
+	seconds = rand(0..59)
+
+	elapsed_string = FormatUtility.getInstance.secondsToElapsedString(to_seconds(days,hours,minutes,seconds))
+	puts "#{days} days, #{hours} hours, #{minutes} minutes and #{seconds} seconds => #{elapsed_string}"
+end
+
+puts "===== RESOLVE PLACEHOLDERS ====="
+template_string = "Hello {name}, I heard your favorite color is {color}."
+placeholder_values = {
+	"name" => "Bob",
+	"color" => "Green",
+}
+puts "template_string: #{template_string}"
+puts "placeholder_values:"
+placeholder_values.each do |key,value|
+	puts "#{key} => #{value}"
+end
+puts "Resolved: #{FormatUtility.getInstance.resolvePlaceholders(template_string,placeholder_values)}"
+
+puts "===== TOKENIZE TEXT ===="
+sample_text =<<SAMPLETEXT
+Hello Moses,
+
+Here's everyone who has yet to turn in a TPS report:
+1.Kyler Douglas from Games & Tools
+2.Jairo Schamberger from Jewelry
+3.Douglas from Games & Tools
+4.Schamberger from Jewelry
+5.Schamberger from Jewelry
+6.Ena Baumbach from Health
+7.McLaughlin from Music
+And with that we move on to: 
+
+Sometimes I feel like:
+Supah Beetle: Arsenal
+with the power of Energy Resistance which I use to keep the Grocery, Shoes & Tools department at bay!
+SAMPLETEXT
+
+puts "sample_text:"
+puts sample_text
+
+puts "Tokens:"
+puts FormatUtility.tokenizeText(sample_text)
+
+puts "===== GET HTML TEXT ====="
+test_html_source =<<HTML
+<!DOCTYPE html>
+<html>
+<head>
+	<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+	<title>Report</title>
+	<style>
+	html, body {
+		margin: 0px;
+		padding: 0px;
+		height: 100%;
+	}
+	</style>
+</head>
+<body>
+	<h1>Document Header</h1>
+	
+	Here is a list of colors
+	<ul>
+	<li>Red</li>
+	<li>Green</li>
+	<li>Blue</li>
+	</ul>
+
+	Visit us on <a href="https://github.com/nuix">GitHub</a>.
+</body>
+</html>
+HTML
+
+puts "test_html_source:"
+puts test_html_source
+
+puts "HTML Text:"
+puts FormatUtility.getHtmlText(test_html_source)