Skip to content

Commit

Permalink
Updated FormatUtility
Browse files Browse the repository at this point in the history
- Added convenience method "tokenizeText" which uses StandardTokenizer to tokenize an arbitrary input string.
- Added convenience method "getHtmlText" which uses Jsoup to parse text from HTML source.
  • Loading branch information
JuicyDragon committed Aug 29, 2019
1 parent 83f9345 commit 0985562
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 0 deletions.
41 changes: 41 additions & 0 deletions Java/src/main/java/com/nuix/superutilities/misc/FormatUtility.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.nuix.superutilities.misc;

import java.io.File;
import java.io.StringReader;
import java.math.BigDecimal;
import java.text.NumberFormat;
import java.util.ArrayList;
Expand All @@ -15,7 +16,11 @@

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.joda.time.DateTime;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import nuix.Item;

Expand Down Expand Up @@ -312,6 +317,42 @@ public static double round(double value, int numberOfDigitsAfterDecimalPoint) {
return bigDecimal.doubleValue();
}

/**
 * Splits arbitrary text into tokens using Lucene's StandardTokenizer.
 * @param inputText The text to tokenize.  When null, no tokenizing is attempted.
 * @return The tokens in the order the tokenizer produced them, or an empty list when inputText is null.
 * @throws Exception Most likely thrown if the Lucene tokenizer encounters a problem.
 */
public static List<String> tokenizeText(String inputText) throws Exception{
    List<String> result = new ArrayList<String>();
    if(inputText == null) {
        return result;
    }

    // Resources are closed in reverse declaration order: tokenizer first, then the reader.
    try(StringReader source = new StringReader(inputText);
            StandardTokenizer lucene = new StandardTokenizer()){
        lucene.setReader(source);
        CharTermAttribute term = lucene.getAttribute(CharTermAttribute.class);
        // reset() must precede the first incrementToken() call
        lucene.reset();
        for(boolean more = lucene.incrementToken(); more; more = lucene.incrementToken()) {
            // The attribute instance is reused; capture its current value as a String
            result.add(term.toString());
        }
        lucene.end();
    }
    return result;
}

/**
 * Convenience method for getting the "rendered text" (text as seen in browser) of HTML source code using Jsoup.
 * @param htmlSource The HTML source code to parse the text from.  May be null.
 * @return The text of the parsed document's body.  An empty string is returned when htmlSource is null
 * or when the parsed document yields no body element.
 */
public static String getHtmlText(String htmlSource) {
    // Mirror tokenizeText: a null input yields an empty result rather than an exception from the parser
    if(htmlSource == null) {
        return "";
    }

    Document doc = Jsoup.parse(htmlSource);
    // NOTE(review): some Jsoup versions appear to return null from body() for documents
    // without a body element (e.g. frameset pages) -- guard rather than risk a NullPointerException
    if(doc.body() == null) {
        return "";
    }
    return doc.body().text();
}

public static String formatAsTextualTable(List<List<String>> rows) {
List<Integer> columnWidths = new ArrayList<Integer>();
for(List<String> row : rows) {
Expand Down
102 changes: 102 additions & 0 deletions RubyTests/Test_FormatUtility.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Exercises several FormatUtility helpers and prints the results for manual inspection.
# The JAR is loaded from the same directory as this script.
script_directory = File.dirname(__FILE__)
require File.join(script_directory,"SuperUtilities.jar")
java_import com.nuix.superutilities.SuperUtilities
# $utilities and NUIX_VERSION are not defined in this script;
# presumably they are supplied by the hosting Nuix scripting environment -- TODO confirm
$su = SuperUtilities.init($utilities,NUIX_VERSION)
java_import com.nuix.superutilities.misc.FormatUtility

# Round trip: hex string -> bytes -> hex string, then print both values for comparison
puts "===== HEX => BYTES / BYTES => HEX ====="
md5_string = "8b69e50c471a3b73b021d2182c372eca"
md5_bytes = FormatUtility.hexToBytes(md5_string)
converted_md5_string = FormatUtility.bytesToHex(md5_bytes)

puts "md5_string: #{md5_string}"
puts "converted_md5_string: #{converted_md5_string}"

# Header for the elapsed-string checks that follow
puts "===== ELAPSED STRING ====="
# Converts a days/hours/minutes/seconds breakdown into a single total number of seconds.
# Every argument defaults to 0.
def to_seconds(days=0,hours=0,minutes=0,seconds=0)
  from_minutes = minutes * 60
  from_hours = hours * 60 * 60
  from_days = days * 24 * 60 * 60
  seconds + from_minutes + from_hours + from_days
end

# Five times over: pick a random days/hours/minutes/seconds breakdown, convert it to
# total seconds with to_seconds, and print it next to what secondsToElapsedString returns
5.times do
days = rand(0..3)
hours = rand(0..23)
minutes = rand(0..59)
seconds = rand(0..59)

elapsed_string = FormatUtility.getInstance.secondsToElapsedString(to_seconds(days,hours,minutes,seconds))
puts "#{days} days, #{hours} hours, #{minutes} minutes and #{seconds} seconds => #{elapsed_string}"
end

# Print a template containing {name} and {color}, the values keyed by those names,
# and the result of passing both to resolvePlaceholders
puts "===== RESOLVE PLACEHOLDERS ====="
template_string = "Hello {name}, I heard your favorite color is {color}."
placeholder_values = {
"name" => "Bob",
"color" => "Green",
}
puts "template_string: #{template_string}"
puts "placeholder_values:"
placeholder_values.each do |key,value|
puts "#{key} => #{value}"
end
puts "Resolved: #{FormatUtility.getInstance.resolvePlaceholders(template_string,placeholder_values)}"

# Print a multi-line sample (names, numbered list items, punctuation, ampersands),
# then print whatever FormatUtility.tokenizeText returns for it
puts "===== TOKENIZE TEXT ===="
sample_text =<<SAMPLETEXT
Hello Moses,
Here's everyone who has yet to turn in a TPS report:
1.Kyler Douglas from Games & Tools
2.Jairo Schamberger from Jewelry
3.Douglas from Games & Tools
4.Schamberger from Jewelry
5.Schamberger from Jewelry
6.Ena Baumbach from Health
7.McLaughlin from Music
And with that we move on to:
Sometimes I feel like:
Supah Beetle: Arsenal
with the power of Energy Resistance which I use to keep the Grocery, Shoes & Tools department at bay!
SAMPLETEXT

puts "sample_text:"
puts sample_text

# The call is static (invoked on the class, not on getInstance)
puts "Tokens:"
puts FormatUtility.tokenizeText(sample_text)

# Print a small HTML page (head with title and style, body with a heading, a list and a link),
# then print whatever FormatUtility.getHtmlText returns for it
puts "===== GET HTML TEXT ====="
test_html_source =<<HTML
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>Report</title>
<style>
html, body {
margin: 0px;
padding: 0px;
height: 100%;
}
</style>
</head>
<body>
<h1>Document Header</h1>
Here is a list of colors
<ul>
<li>Red</li>
<li>Green</li>
<li>Blue</li>
</ul>
Visit us on <a href="https://github.com/nuix">GitHub</a>.
</body>
</html>
HTML

puts "test_html_source:"
puts test_html_source

# The call is static (invoked on the class, not on getInstance)
puts "HTML Text:"
puts FormatUtility.getHtmlText(test_html_source)

0 comments on commit 0985562

Please sign in to comment.