CitationRisConverter.java

/*
 * Copyright 2024 Global Crop Diversity Trust
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.genesys.server.service.worker.bib;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;
import org.genesys.server.api.v2.model.bib.AuthorDTO;
import org.genesys.server.api.v2.model.bib.CitationDTO;
import org.genesys.server.model.bib.CitationType;
import org.springframework.stereotype.Component;

import lombok.extern.slf4j.Slf4j;

/**
 * Converts citations between {@link CitationDTO} and the RIS text format.
 * <p>
 * RIS is a standard, line-oriented file format for citations: every line has the form
 * {@code TG  - value}, where {@code TG} is a two-character tag. A record starts with a
 * {@code TY} line and ends with an {@code ER} line.
 *
 * @see <a href="https://en.wikipedia.org/wiki/RIS_(file_format)">RIS_(file_format)</a>
 */
@Component
@Slf4j
public class CitationRisConverter {
	private static final String RIS_SEPARATOR = "  - ";
	private static final String RIS_EOL = "\r\n";
	/** Byte-order mark that some exporters leave at the very start of a file. */
	private static final char BYTE_ORDER_MARK = '\uFEFF';

	public static final String TYPE_OF_REFERENCE = "TY";
	public static final String END_OF_REFERENCE = "ER";
	public static final String TITLE = "TI";
	public static final String TITLE_1 = "T1";
	public static final String TITLE_2 = "T2"; // Commonly the name of journal, periodical
	public static final String TRANSLATED_TITLE = "TT";
	public static final String LANGUAGE = "LA";
	public static final String KEYWORD = "KW";
	public static final String KEYWORD_1 = "K1";
	public static final String PUBLICATION_YEAR = "PY";
	public static final String PUBLICATION_YEAR_1 = "YR";
	public static final String URL = "UR";
	public static final String URL_1 = "L1"; // Link to PDF. These links should end with a file name, and not simply a landing page.
	public static final String URL_2 = "L2"; // Link to full-text.
	public static final String AUTHOR = "AU";
	public static final String DOI = "DO";
	public static final String ABSTRACT = "AB";
	public static final String NOTE_1 = "N1";
	public static final String NOTE_2 = "N2";
	public static final String JOURNAL_FULL = "JF";
	public static final String JOURNAL_FULL_1 = "JO";
	public static final String JOURNAL = "JA";
	public static final String JOURNAL_2 = "J2";
	public static final String PUBLISHER = "PB";
	public static final String SERIAL_NUMBER = "SN";
	public static final String VOLUME = "VL";
	public static final String ISSUE = "IS";
	public static final String START_PAGE = "SP";
	public static final String END_PAGE = "EP";

	/**
	 * A RIS line: group 1 is the two-character tag, group 2 the optional value.
	 * DOTALL lets the value contain any character (including a literal '$').
	 */
	private static final Pattern RIS_PATTERN = Pattern.compile("([A-Z][A-Z\\d])  - ?(.+)?", Pattern.DOTALL);
	private static final Pattern RIS_HTML_ITALIC = Pattern.compile("<em>|</em>|<i>|</i>", Pattern.CASE_INSENSITIVE);
	private static final Pattern RIS_HTML_BOLD = Pattern.compile("<b>|</b>", Pattern.CASE_INSENSITIVE);
	private static final Pattern RIS_WHITESPACE_RUN = Pattern.compile("\\s\\s+");

	/**
	 * Read citations in RIS format. All records are parsed eagerly before the stream is returned.
	 * The input is not closed by this method.
	 *
	 * @param input RIS data
	 * @return Stream of citations
	 * @throws IOException when reading from {@code input} fails
	 * @see <a href="https://en.wikipedia.org/wiki/RIS_(file_format)">RIS_(file_format)</a>
	 */
	public Stream<CitationDTO> readCitations(Reader input) throws IOException {
		Stream.Builder<CitationDTO> stream = Stream.builder();
		var reader = new BufferedReader(input, 1024);
		CitationDTO citation;
		while ((citation = readCitation(reader)) != null) {
			stream.add(citation);
		}
		return stream.build();
	}

	/**
	 * Read the next complete citation record (from its {@code TY} line to its {@code ER} line).
	 * Lines that are not RIS lines, tags without data, and lines outside of a record are skipped.
	 *
	 * @param reader source of RIS lines; must support mark/reset for multi-line values
	 * @return the next citation, or {@code null} when the input ends before a record is completed
	 * @throws IOException when reading from {@code reader} fails
	 */
	public CitationDTO readCitation(BufferedReader reader) throws IOException {
		CitationDTO citation = null;
		StringBuilder source = null; // RIS lines of the current record, kept as its source data
		String line;

		while ((line = reader.readLine()) != null) {
			if (!line.isEmpty() && line.charAt(0) == BYTE_ORDER_MARK) {
				line = line.substring(1); // A leading BOM would otherwise hide the first tag
			}
			log.trace("<< {}", line);

			var ris = RIS_PATTERN.matcher(line);
			if (!ris.matches()) {
				log.debug("Skipping non-RIS line: {}", line);
				continue;
			}

			if (source != null) source.append(line).append(RIS_EOL);

			String tag = ris.group(1);
			String data = ris.group(2);
			log.debug("RIS {}: {}", tag, data);

			if (END_OF_REFERENCE.equals(tag)) {
				if (citation == null) {
					// Returning null here would be read as end-of-input and drop all following records
					log.debug("Ignoring end-of-reference tag outside of a record");
					continue;
				}
				citation.setSourceData(source.toString());
				log.debug("Created {}", citation);
				return citation;
			}

			data = StringUtils.trimToEmpty(data);
			if (StringUtils.isBlank(data)) continue; // Skip tag without data
			data = cleanup(data);

			if (TYPE_OF_REFERENCE.equals(tag)) {
				source = new StringBuilder(1024);
				source.append(line).append(RIS_EOL); // Add start line

				citation = new CitationDTO(); // new citation
				citation.setType(CitationType.forValue(data));
				log.info("New citation of type {}", citation.getType());
				continue;
			}

			if (citation == null) continue; // Don't read, we don't have a target

			applyField(citation, tag, data, reader, source);
		}

		if (citation != null) {
			log.warn("Discarding incomplete citation: input ended before the {} tag", END_OF_REFERENCE);
		}
		return null; // We didn't reach end-of-record "ER" tag
	}

	/**
	 * Normalize a tag value: allowed HTML tags become Markdown, tabs become spaces and
	 * runs of whitespace collapse into a single space.
	 */
	private static String cleanup(String data) {
		if (data.contains("<")) { // Process allowed HTML tags into Markdown
			data = RIS_HTML_ITALIC.matcher(data).replaceAll("*");
			data = RIS_HTML_BOLD.matcher(data).replaceAll("**");
		}
		return RIS_WHITESPACE_RUN.matcher(data.replace('\t', ' ')).replaceAll(" ");
	}

	/**
	 * Store the value of one RIS tag in the citation. Unsupported tags are ignored.
	 */
	private void applyField(CitationDTO citation, String tag, String data, BufferedReader reader, StringBuilder source) throws IOException {
		switch (tag) {
		case TITLE:
			citation.setTitle(data);
			break;
		case TITLE_1: // Fallback title, TI takes precedence
			if (StringUtils.isBlank(citation.getTitle())) citation.setTitle(data);
			break;
		case DOI:
			citation.setDoi(data);
			break;
		case PUBLICATION_YEAR:
		case PUBLICATION_YEAR_1:
			readPublicationYear(citation, data);
			break;
		case AUTHOR:
			readAuthor(citation, data);
			break;
		case KEYWORD:
		case KEYWORD_1:
			if (citation.getKeywords() == null) citation.setKeywords(new ArrayList<>());
			citation.getKeywords().add(data);
			break;
		case URL:
		case URL_1:
		case URL_2:
			readUrls(citation, tag, data);
			break;
		case ABSTRACT:
			citation.setAbstractText(readMultiline(reader, data, source));
			break;
		case NOTE_1:
		case NOTE_2: // Notes are used as the abstract only when no abstract is present
			if (StringUtils.isBlank(citation.getAbstractText())) {
				citation.setAbstractText(readMultiline(reader, data, source));
			}
			break;
		case PUBLISHER:
			citation.setPublisher(data);
			break;
		case JOURNAL_FULL:
		case JOURNAL_FULL_1:
			citation.setPublication(data);
			break;
		case TITLE_2:
		case JOURNAL:
		case JOURNAL_2: // Fallback publication name, JF/JO take precedence
			if (citation.getPublication() == null) citation.setPublication(data);
			break;
		case LANGUAGE:
			citation.setLanguage(data);
			break;
		case SERIAL_NUMBER:
			citation.setSerialNumber(data);
			break;
		case VOLUME:
			citation.setVolume(data);
			break;
		case ISSUE:
			citation.setIssue(data);
			break;
		case START_PAGE:
			citation.setPages(data);
			break;
		case END_PAGE: // Only meaningful after a start page
			if (citation.getPages() != null) {
				citation.setPages(citation.getPages() + "-" + data);
			}
			break;
		default:
			break;
		}
	}

	/**
	 * Parse the publication year. The legacy {@code YYYY/MM/DD/other} date form is accepted
	 * by using only the part before the first slash.
	 */
	private static void readPublicationYear(CitationDTO citation, String data) {
		int slash = data.indexOf('/');
		var year = (slash < 0 ? data : data.substring(0, slash)).trim();
		try {
			citation.setPublicationYear(Integer.parseInt(year));
		} catch (NumberFormatException e) {
			log.warn("Invalid publication year {}", data);
		}
	}

	/**
	 * Add an author given as {@code Last name, Other names}.
	 */
	private static void readAuthor(CitationDTO citation, String data) {
		if (citation.getAuthors() == null) citation.setAuthors(new ArrayList<>());
		var names = data.split(" ?,\\s*", 2);
		var author = new AuthorDTO();
		author.setLastName(StringUtils.trimToNull(names[0]));
		if (names.length > 1) author.setOtherNames(StringUtils.trimToNull(names[1]));
		citation.getAuthors().add(author);
	}

	/**
	 * Add the semicolon-separated URLs of one tag. The first URL of an {@code L1} tag
	 * becomes the download URL unless one is already set.
	 */
	private static void readUrls(CitationDTO citation, String tag, String data) {
		if (citation.getUrls() == null) citation.setUrls(new ArrayList<>());
		var urls = Arrays.stream(data.split(";")).map(StringUtils::trimToNull).filter(Objects::nonNull).toArray(String[]::new);
		if (urls.length == 0) {
			log.debug("No URLs in {}", data);
			return;
		}
		citation.getUrls().addAll(Arrays.asList(urls));
		if (URL_1.equals(tag) && citation.getDownloadUrl() == null) {
			citation.setDownloadUrl(urls[0]);
		}
	}

	/**
	 * Attempt to read multi-line text by checking if the next line is a RIS command or not.
	 * Concatenate the lines until a RIS command is reached.
	 *
	 * @param reader source reader
	 * @param initialData first line
	 * @param source buffer collecting the RIS lines of the current record; continuation lines are appended
	 * @return multi-line text, lines joined with {@code \n}
	 * @throws IOException when reading from {@code reader} fails
	 */
	private String readMultiline(BufferedReader reader, String initialData, StringBuilder source) throws IOException {
		StringBuilder sb = new StringBuilder(initialData);
		char[] cbuf = new char[5]; // Length of "TG  -", enough to recognize a RIS command
		while (true) {
			reader.mark(10);
			// A single read may return fewer characters than requested, keep reading up to EOF
			int filled = 0;
			while (filled < cbuf.length) {
				int count = reader.read(cbuf, filled, cbuf.length - filled);
				if (count < 0) break;
				filled += count;
			}
			if (filled == 0) break; // End of input
			reader.reset();

			// Use only what was read, the buffer may hold characters of the previous preview
			var nextPreview = new String(cbuf, 0, filled);
			if (RIS_PATTERN.matcher(nextPreview).matches()) {
				log.trace("Next line is a RIS command: {}", nextPreview);
				break;
			}
			var line = reader.readLine();
			sb.append("\n").append(line);
			source.append(line).append(RIS_EOL);
		}

		return sb.toString();
	}

	/**
	 * Write citation in RIS format. Blank values are omitted.
	 * <p>
	 * NOTE(review): issue and pages are parsed on import but are not exported here.
	 *
	 * @param citation the citation to export in RIS; its type must not be {@code null}
	 * @param writer destination of the RIS record
	 * @return the {@code writer} that was passed in
	 * @throws IOException when writing to {@code writer} fails
	 */
	public Writer writeCitation(CitationDTO citation, Writer writer) throws IOException {

		writer.append(TYPE_OF_REFERENCE).append(RIS_SEPARATOR).append(citation.getType().getCode()).append(RIS_EOL);

		writeTag(writer, TITLE, citation.getTitle());
		writeTag(writer, DOI, citation.getDoi());

		if (citation.getPublicationYear() != null) {
			writeTag(writer, PUBLICATION_YEAR, citation.getPublicationYear().toString());
		}

		// The publication name is exported under both tags
		writeTag(writer, TITLE_2, citation.getPublication());
		writeTag(writer, JOURNAL_2, citation.getPublication());

		writeTag(writer, LANGUAGE, citation.getLanguage());
		writeTag(writer, SERIAL_NUMBER, citation.getSerialNumber());
		writeTag(writer, VOLUME, citation.getVolume());
		writeTag(writer, PUBLISHER, citation.getPublisher());

		if (citation.getAuthors() != null) {
			for (var author : citation.getAuthors()) {
				boolean hasOtherNames = StringUtils.isNotBlank(author.getOtherNames());
				if (StringUtils.isBlank(author.getLastName()) && !hasOtherNames) continue; // Nothing to write

				// A missing last name is written as empty text so that ", Other names" reads back unchanged
				writer.append(AUTHOR).append(RIS_SEPARATOR).append(Objects.toString(author.getLastName(), ""));
				if (hasOtherNames) writer.append(", ").append(author.getOtherNames());
				writer.append(RIS_EOL);
			}
		}

		if (citation.getUrls() != null) {
			for (var url : citation.getUrls()) {
				writeTag(writer, URL, url);
			}
		}

		writeTag(writer, URL_1, citation.getDownloadUrl());

		if (citation.getKeywords() != null) {
			for (var keyword : citation.getKeywords()) {
				writeTag(writer, KEYWORD, keyword);
			}
		}

		writeTag(writer, ABSTRACT, citation.getAbstractText());

		writer.append(END_OF_REFERENCE).append(RIS_SEPARATOR).append(RIS_EOL);
		return writer;
	}

	/**
	 * Write one RIS line, unless the value is blank.
	 */
	private static void writeTag(Writer writer, String tag, CharSequence value) throws IOException {
		if (StringUtils.isNotBlank(value)) {
			writer.append(tag).append(RIS_SEPARATOR).append(value).append(RIS_EOL);
		}
	}
}