JsoupHtmlConverter.java

/*
 * Copyright 2021 Global Crop Diversity Trust
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.genesys.server.service.impl;

import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.genesys.server.service.HtmlConverter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.cache.annotation.Cacheable;
import org.springframework.stereotype.Service;

/**
 * {@link HtmlConverter} implementation that renders HTML as plain text using
 * jsoup.
 * <p>
 * Based on
 * https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java
 */
@Service
public class JsoupHtmlConverter implements HtmlConverter {

	/** Base URI used to resolve link targets; injected from the {@code base.url} property. */
	@Value("${base.url}")
	private String baseUrl;

	/**
	 * Convert an HTML fragment to plain text.
	 * <p>
	 * Results are cached in {@code htmltotextcache}, keyed by the input HTML.
	 * Caching is skipped for a {@code null} argument because a {@code null}
	 * cache key is rejected by Spring's cache abstraction; without the
	 * condition the blank-input guard below would never be reached for
	 * {@code null} when the method is invoked through the caching proxy.
	 *
	 * @param html
	 *            the HTML fragment to convert, may be {@code null} or blank
	 * @return the trimmed plain-text rendering, or an empty string when
	 *         {@code html} is {@code null} or blank
	 */
	@Override
	@Cacheable(value = "htmltotextcache", key = "#html", condition = "#html != null", unless = "#result == null")
	public String toText(String html) {
		if (StringUtils.isBlank(html)) {
			return StringUtils.EMPTY;
		}
		Document doc = Jsoup.parseBodyFragment(html);
		return getPlainText(doc);
	}

	/**
	 * Format an Element to plain-text
	 * 
	 * @param element
	 *            the root element to format
	 * @return formatted text, with leading and trailing whitespace trimmed
	 */
	private String getPlainText(Element element) {
		FormattingVisitor formatter = new FormattingVisitor(baseUrl);
		// walk the DOM, and call .head() and .tail() for each node
		NodeTraversor.traverse(formatter, element);
		return formatter.toString().trim();
	}

	/**
	 * The formatting rules, applied while {@link NodeTraversor} walks the DOM
	 * depth-first: {@link #head(Node, int)} runs when a node is entered and
	 * {@link #tail(Node, int)} after all of its children have been visited.
	 */
	private static final class FormattingVisitor implements NodeVisitor {

		// elements that start on a new line
		// NOTE(review): h6 is not listed in either set - confirm whether that is intentional
		private static final List<String> NEWLINE_BEFORE = List.of("p", "h1", "h2", "h3", "h4", "h5", "tr");

		// elements that are followed by a line break
		private static final List<String> NEWLINE_AFTER = List.of("br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5");

		// holds the accumulated text
		private final StringBuilder accum = new StringBuilder();

		// base URI applied to anchors before resolving their href
		private final String baseUri;

		/**
		 * @param baseUri
		 *            base URI set on each anchor element before its
		 *            {@code href} is resolved to an absolute URL
		 */
		FormattingVisitor(String baseUri) {
			this.baseUri = baseUri;
		}

		/**
		 * hit when the node is first seen
		 */
		@Override
		public void head(Node node, int depth) {
			String name = node.nodeName();
			if (node instanceof TextNode) {
				// TextNodes carry all user-readable text in the DOM.
				append(((TextNode) node).text());
			} else if (name.equals("li")) {
				append("\n * ");
			} else if (name.equals("dt")) {
				append("  ");
			} else if (NEWLINE_BEFORE.contains(name)) {
				append("\n");
			}
		}

		/**
		 * hit when all of the node's children (if any) have been visited
		 */
		@Override
		public void tail(Node node, int depth) {
			String name = node.nodeName();
			if (NEWLINE_AFTER.contains(name)) {
				append("\n");
			} else if (name.equals("a")) {
				node.setBaseUri(baseUri);
				String href = node.absUrl("href");
				// absUrl yields an empty string when href is missing or cannot be
				// made absolute; skip it rather than emitting a dangling " <>"
				if (!href.isEmpty()) {
					append(" <" + href + ">");
				}
			}
		}

		/**
		 * Append text to the buffer, dropping a lone space that would directly
		 * follow the start of the buffer, another space or a newline.
		 *
		 * @param text
		 *            the text to append
		 */
		private void append(String text) {
			if (text.equals(" ") && endsWithBreakOrSpace()) {
				return; // don't accumulate long runs of empty spaces
			}
			accum.append(text);
		}

		/**
		 * @return {@code true} when the buffer is empty or its last character
		 *         is a space or a newline
		 */
		private boolean endsWithBreakOrSpace() {
			if (accum.length() == 0) {
				return true;
			}
			char last = accum.charAt(accum.length() - 1);
			return last == ' ' || last == '\n';
		}

		@Override
		public String toString() {
			return accum.toString();
		}
	}
}