CitationRisConverter.java
/*
* Copyright 2024 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys.server.service.worker.bib;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.genesys.server.api.v2.model.bib.AuthorDTO;
import org.genesys.server.api.v2.model.bib.CitationDTO;
import org.genesys.server.model.bib.CitationType;
import org.springframework.stereotype.Component;
import lombok.extern.slf4j.Slf4j;
/**
 * Converts bibliographic citations between {@link CitationDTO} and the RIS
 * tagged text format.
 *
 * <p>
 * A RIS record is a sequence of lines of the form {@code TAG  - value}. A record
 * starts with the {@code TY} tag and ends with the {@code ER} tag. Only the tags
 * declared as constants in this class are interpreted when reading; lines with
 * other tags are kept in the record's source data but otherwise ignored.
 * </p>
 *
 * @see <a href="https://en.wikipedia.org/wiki/RIS_(file_format)">RIS_(file_format)</a>
 */
@Component
@Slf4j
public class CitationRisConverter {

	/** Separator written between a tag and its value: two spaces, a hyphen and one space. */
	private static final String RIS_SEPARATOR = "  - ";
	/** Line terminator used when writing RIS and when recording tagged source lines. */
	private static final String RIS_EOL = "\r\n";

	public static final String TYPE_OF_REFERENCE = "TY";
	public static final String END_OF_REFERENCE = "ER";
	public static final String TITLE = "TI";
	public static final String TITLE_1 = "T1";
	public static final String TITLE_2 = "T2"; // Commonly the name of journal, periodical
	public static final String TRANSLATED_TITLE = "TT";
	public static final String LANGUAGE = "LA";
	public static final String KEYWORD = "KW";
	public static final String KEYWORD_1 = "K1";
	public static final String PUBLICATION_YEAR = "PY";
	public static final String PUBLICATION_YEAR_1 = "YR";
	public static final String URL = "UR";
	public static final String URL_1 = "L1"; // Link to PDF. These links should end with a file name, and not simply a landing page.
	public static final String URL_2 = "L2"; // Link to full-text.
	public static final String AUTHOR = "AU";
	public static final String DOI = "DO";
	public static final String ABSTRACT = "AB";
	public static final String NOTE_1 = "N1";
	public static final String NOTE_2 = "N2";
	public static final String JOURNAL_FULL = "JF";
	public static final String JOURNAL_FULL_1 = "JO";
	public static final String JOURNAL = "JA";
	public static final String JOURNAL_2 = "J2";
	public static final String PUBLISHER = "PB";
	public static final String SERIAL_NUMBER = "SN";
	public static final String VOLUME = "VL";
	public static final String ISSUE = "IS";
	public static final String START_PAGE = "SP";
	public static final String END_PAGE = "EP";

	/**
	 * Matches a RIS line: a two-character tag, one or more spaces, a hyphen, an
	 * optional space and the optional value. The value group accepts any
	 * character; the former {@code [^$]} class rejected every line containing a
	 * dollar sign. One or more spaces are accepted before the hyphen so that both
	 * the standard two-space form and a single-space variant are parsed.
	 */
	private static final Pattern RIS_PATTERN = Pattern.compile("([A-Z][A-Z\\d]) +- ?([\\s\\S]+)?");
	private static final Pattern RIS_HTML_ITALIC = Pattern.compile("<em>|</em>|<i>|</i>", Pattern.CASE_INSENSITIVE);
	private static final Pattern RIS_HTML_BOLD = Pattern.compile("<b>|</b>", Pattern.CASE_INSENSITIVE);

	/**
	 * Read citations in RIS format.
	 *
	 * <p>
	 * All records are read eagerly before the stream is returned. The supplied
	 * reader is wrapped in a {@link BufferedReader} and is not closed here.
	 * </p>
	 *
	 * @param input RIS data
	 * @return Stream of citations
	 * @throws IOException when reading from {@code input} fails
	 * @see <a href="https://en.wikipedia.org/wiki/RIS_(file_format)">RIS_(file_format)</a>
	 */
	public Stream<CitationDTO> readCitations(Reader input) throws IOException {
		Stream.Builder<CitationDTO> stream = Stream.builder();
		var reader = new BufferedReader(input, 1024);
		CitationDTO citation;
		while ((citation = readCitation(reader)) != null) {
			stream.add(citation);
		}
		return stream.build();
	}

	/**
	 * Read the next complete citation from the reader.
	 *
	 * <p>
	 * Lines that do not look like RIS are skipped. Tagged lines seen before the
	 * first {@code TY} tag are ignored, including a stray {@code ER}. If a new
	 * {@code TY} tag is found before the current record was closed with
	 * {@code ER}, the unfinished record is discarded.
	 * </p>
	 *
	 * @param reader source reader; must support {@code mark}/{@code reset}
	 * @return the citation terminated by the next {@code ER} tag, or {@code null}
	 *         when the input ends before a record is completed
	 * @throws IOException when reading fails
	 */
	public CitationDTO readCitation(BufferedReader reader) throws IOException {
		CitationDTO citation = null;
		StringBuilder source = null; // Raw RIS lines of the current record
		String line;
		while ((line = reader.readLine()) != null) {
			log.trace("<< {}", line);
			var ris = RIS_PATTERN.matcher(line);
			if (!ris.matches()) {
				log.debug("Skipping non-RIS line: {}", line);
				continue;
			}
			if (source != null) source.append(line).append(RIS_EOL);

			String tag = ris.group(1);
			String data = ris.group(2);
			log.debug("RIS {}: {}", tag, data);

			if (Objects.equals(END_OF_REFERENCE, tag)) {
				if (citation == null) {
					// Returning null here would be mistaken for end-of-input by readCitations
					log.debug("Ignoring {} tag without an open record", END_OF_REFERENCE);
					continue;
				}
				citation.setSourceData(source.toString());
				log.debug("Created {}", citation);
				return citation;
			}

			data = StringUtils.trimToEmpty(data);
			if (StringUtils.isBlank(data)) continue; // Skip tag without data
			data = cleanup(data);

			if (Objects.equals(TYPE_OF_REFERENCE, tag)) {
				if (citation != null) {
					log.warn("Discarding unterminated citation {}", citation);
				}
				source = new StringBuilder(1024);
				source.append(line).append(RIS_EOL); // Add start line
				citation = new CitationDTO(); // new citation
				citation.setType(CitationType.forValue(data));
				log.info("New citation of type {}", citation.getType());
			}
			if (citation == null) continue; // Don't read, we don't have a target

			applyTag(citation, tag, data, reader, source);
		}
		return null; // We didn't reach end-of-record "ER" tag
	}

	/**
	 * Normalize a tag value: allowed HTML emphasis tags become Markdown markers,
	 * tabs become spaces and runs of whitespace are collapsed to a single space.
	 *
	 * @param data trimmed, non-blank tag value
	 * @return the cleaned value
	 */
	private static String cleanup(String data) {
		String result = data;
		if (result.contains("<")) { // Process allowed HTML tags into Markdown
			result = RIS_HTML_ITALIC.matcher(result).replaceAll("*");
			result = RIS_HTML_BOLD.matcher(result).replaceAll("**");
		}
		if (result.contains("\t")) { // Replace tab with space
			result = result.replace('\t', ' ');
		}
		return result.replaceAll("\\s\\s+", " "); // Remove double whitespace
	}

	/**
	 * Store the value of one RIS tag on the citation. Unrecognized tags are ignored.
	 *
	 * @param citation the citation being built
	 * @param tag the two-character RIS tag
	 * @param data the cleaned, non-blank value of the tag
	 * @param reader source reader, used to pull continuation lines of multi-line values
	 * @param source raw source of the current record; continuation lines are appended to it
	 * @throws IOException when reading continuation lines fails
	 */
	private void applyTag(CitationDTO citation, String tag, String data, BufferedReader reader, StringBuilder source) throws IOException {
		if (Objects.equals(TITLE, tag)) {
			citation.setTitle(data);
		} else if (Objects.equals(TITLE_1, tag) && StringUtils.isBlank(citation.getTitle())) {
			citation.setTitle(data); // T1 is only a fallback for TI
		} else if (Objects.equals(DOI, tag)) {
			citation.setDoi(data);
		} else if (Objects.equals(PUBLICATION_YEAR, tag) || Objects.equals(PUBLICATION_YEAR_1, tag)) {
			try {
				citation.setPublicationYear(Integer.parseInt(data));
			} catch (NumberFormatException e) {
				log.warn("Invalid publication year {}", data);
			}
		} else if (Objects.equals(AUTHOR, tag)) {
			addAuthor(citation, data);
		} else if (Objects.equals(KEYWORD, tag) || Objects.equals(KEYWORD_1, tag)) {
			if (citation.getKeywords() == null) citation.setKeywords(new ArrayList<>());
			citation.getKeywords().add(data);
		} else if (Objects.equals(URL, tag) || Objects.equals(URL_1, tag) || Objects.equals(URL_2, tag)) {
			addUrls(citation, tag, data);
		} else if (Objects.equals(ABSTRACT, tag)) {
			citation.setAbstractText(readMultiline(reader, data, source));
		} else if ((Objects.equals(NOTE_1, tag) || Objects.equals(NOTE_2, tag)) && StringUtils.isBlank(citation.getAbstractText())) {
			citation.setAbstractText(readMultiline(reader, data, source)); // Notes are only a fallback for AB
		} else if (Objects.equals(PUBLISHER, tag)) {
			citation.setPublisher(data);
		} else if (Objects.equals(JOURNAL_FULL, tag) || Objects.equals(JOURNAL_FULL_1, tag)) {
			citation.setPublication(data);
		} else if (Objects.equals(TITLE_2, tag) || Objects.equals(JOURNAL, tag) || Objects.equals(JOURNAL_2, tag)) {
			if (citation.getPublication() == null) citation.setPublication(data);
		} else if (Objects.equals(LANGUAGE, tag)) {
			citation.setLanguage(data);
		} else if (Objects.equals(SERIAL_NUMBER, tag)) {
			citation.setSerialNumber(data);
		} else if (Objects.equals(VOLUME, tag)) {
			citation.setVolume(data);
		} else if (Objects.equals(ISSUE, tag)) {
			citation.setIssue(data);
		} else if (Objects.equals(START_PAGE, tag)) {
			citation.setPages(data);
		} else if (Objects.equals(END_PAGE, tag)) {
			// The end page is only recorded when a start page was already read
			if (citation.getPages() != null) {
				citation.setPages(citation.getPages() + "-" + data);
			}
		}
	}

	/**
	 * Parse an {@code AU} value of the form {@code Last, Other names} and add it
	 * to the citation's authors.
	 */
	private static void addAuthor(CitationDTO citation, String data) {
		if (citation.getAuthors() == null) citation.setAuthors(new ArrayList<>());
		var author = new AuthorDTO();
		var names = data.split(" ?,\\s*", 2);
		author.setLastName(StringUtils.trimToNull(names[0]));
		if (names.length > 1) author.setOtherNames(StringUtils.trimToNull(names[1]));
		citation.getAuthors().add(author);
	}

	/**
	 * Add the semicolon-separated URLs of a URL tag to the citation. The first
	 * URL of an {@code L1} tag becomes the download URL unless one is already set.
	 */
	private void addUrls(CitationDTO citation, String tag, String data) {
		if (citation.getUrls() == null) citation.setUrls(new ArrayList<>());
		var urls = data.split(";");
		if (urls.length > 0) {
			Arrays.stream(urls).map(StringUtils::trimToNull).filter(Objects::nonNull).forEach(citation.getUrls()::add);
			if (Objects.equals(URL_1, tag) && citation.getDownloadUrl() == null)
				citation.setDownloadUrl(StringUtils.trimToNull(urls[0]));
		} else {
			log.debug("No URLs in {}", data);
		}
	}

	/**
	 * Attempt to read multi-line text by checking if the next line is a RIS command or not.
	 * Concatenate the lines until a RIS command is reached.
	 *
	 * @param reader source reader
	 * @param initialData first line
	 * @param source raw source of the current record; each continuation line is appended to it
	 * @return multi-line text
	 * @throws IOException when reading fails
	 */
	private String readMultiline(BufferedReader reader, String initialData, StringBuilder source) throws IOException {
		StringBuilder sb = new StringBuilder(initialData);
		// multiline support
		char[] cbuf = new char[5];
		while (true) {
			reader.mark(10);
			int count = reader.read(cbuf);
			if (count <= 0) {
				break; // End of input
			}
			// Use only the characters actually read; the buffer may hold stale data from the previous pass
			var nextPreview = new String(cbuf, 0, count);
			reader.reset();
			if (RIS_PATTERN.matcher(nextPreview).matches()) {
				log.trace("Next line is a RIS command: {}", nextPreview);
				break;
			}
			var line = reader.readLine();
			sb.append("\n").append(line);
			source.append(line).append("\n");
		}
		return sb.toString();
	}

	/**
	 * Write citation in RIS format.
	 *
	 * <p>
	 * Blank or missing values are not written. The citation type must be set.
	 * </p>
	 *
	 * @param citation the citation to export in RIS
	 * @param writer the target to which the RIS record is appended
	 * @return the {@code writer} that was passed in
	 * @throws IOException when writing to {@code writer} fails
	 */
	public Writer writeCitation(CitationDTO citation, Writer writer) throws IOException {
		writer.append(TYPE_OF_REFERENCE).append(RIS_SEPARATOR).append(citation.getType().getCode()).append(RIS_EOL);
		writeTag(writer, TITLE, citation.getTitle());
		writeTag(writer, DOI, citation.getDoi());
		if (citation.getPublicationYear() != null) {
			writeTag(writer, PUBLICATION_YEAR, citation.getPublicationYear().toString());
		}
		// The publication name is emitted under both tags
		writeTag(writer, TITLE_2, citation.getPublication());
		writeTag(writer, JOURNAL_2, citation.getPublication());
		writeTag(writer, LANGUAGE, citation.getLanguage());
		writeTag(writer, SERIAL_NUMBER, citation.getSerialNumber());
		writeTag(writer, VOLUME, citation.getVolume());
		writeTag(writer, PUBLISHER, citation.getPublisher());
		if (citation.getAuthors() != null) {
			for (var author : citation.getAuthors()) {
				if (author == null) continue;
				var name = new StringBuilder(StringUtils.trimToEmpty(author.getLastName()));
				if (StringUtils.isNotBlank(author.getOtherNames())) name.append(", ").append(author.getOtherNames());
				writeTag(writer, AUTHOR, name.toString());
			}
		}
		if (citation.getUrls() != null) {
			for (var url : citation.getUrls()) {
				writeTag(writer, URL, url);
			}
		}
		writeTag(writer, URL_1, citation.getDownloadUrl());
		if (citation.getKeywords() != null) {
			for (var keyword : citation.getKeywords()) {
				writeTag(writer, KEYWORD, keyword);
			}
		}
		writeTag(writer, ABSTRACT, citation.getAbstractText());
		writer.append(END_OF_REFERENCE).append(RIS_SEPARATOR).append(RIS_EOL);
		return writer;
	}

	/**
	 * Append one {@code TAG  - value} line, unless the value is blank.
	 *
	 * @param writer the target
	 * @param tag the two-character RIS tag
	 * @param value the value; nothing is written when it is {@code null} or blank
	 * @throws IOException when writing fails
	 */
	private static void writeTag(Writer writer, String tag, String value) throws IOException {
		if (StringUtils.isNotBlank(value)) {
			writer.append(tag).append(RIS_SEPARATOR).append(value).append(RIS_EOL);
		}
	}
}