// Taxonomy2GRINMatcher.java

package org.genesys.server.service.worker;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.genesys.server.model.genesys.Taxonomy2;
import org.genesys.server.model.grin.TaxonomySpecies;
import org.genesys.server.persistence.Taxonomy2Repository;
import org.genesys.server.persistence.grin.TaxonomyGenusRepository;
import org.genesys.server.persistence.grin.TaxonomySpeciesRepository;
import org.genesys.spring.TransactionHelper;
import org.genesys.taxonomy.checker.CachingInMemoryTaxonomyDatabase;
import org.genesys.taxonomy.checker.InMemoryTaxonomyDatabase;
import org.genesys.taxonomy.checker.StringSimilarity;
import org.genesys.taxonomy.checker.TaxonomyChecker;
import org.genesys.taxonomy.checker.TaxonomyDatabase;
import org.genesys.taxonomy.checker.TaxonomyException;
import org.genesys.taxonomy.gringlobal.model.IGrinSpecies;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;

import lombok.extern.slf4j.Slf4j;

/**
 * Matches {@link Taxonomy2} records to GRIN {@link TaxonomySpecies}.
 *
 * <p>
 * {@link #update()} loads the GRIN genus and species tables into an in-memory
 * taxonomy database, then pages through all {@link Taxonomy2} records and, for
 * each one, sets (or clears) its GRIN species and current GRIN species
 * references according to the lookup result. A record that carries an override
 * species is only modified when the lookup agrees with that override.
 * </p>
 */
@Component
@Slf4j
public class Taxonomy2GRINMatcher {

	/** Number of {@link Taxonomy2} records loaded, processed and saved per page. */
	private static final int PAGE_SIZE = 1000;

	/** A species authority counts as matching when its similarity score exceeds this value. */
	private static final double AUTHORITY_SIMILARITY_THRESHOLD = 0.8;

	@Autowired
	private TaxonomyGenusRepository taxonomyGenusRepository;

	@Autowired
	private TaxonomySpeciesRepository taxonomySpeciesRepository;

	@Autowired
	private Taxonomy2Repository taxonomy2Repository;

	/**
	 * Loads GRIN Taxonomy into memory and matches every {@link Taxonomy2} record
	 * against it.
	 *
	 * <p>
	 * NOTE(review): this method is declared read-only transactional while the
	 * modified records are saved through
	 * {@code TransactionHelper.updateAsCurrentUser}; presumably that helper manages
	 * its own write transaction -- confirm.
	 * </p>
	 *
	 * @throws Exception if matching or saving fails
	 */
	@Transactional(readOnly = true)
	public void update() throws Exception {
		final InMemoryTaxonomyDatabase taxonomyDatabase = new CachingInMemoryTaxonomyDatabase();
		readDatabase(taxonomyDatabase);
		log.warn("Loaded GRIN Taxonomy to memory");

		final TaxonomyChecker taxonomyChecker = new TaxonomyChecker();
		taxonomyChecker.setTaxonomyDatabase(taxonomyDatabase);

		run(taxonomyChecker, taxonomyDatabase);
		log.warn("Done matching Taxonomy2 to GRIN TaxonomySpecies.");
	}

	/**
	 * Registers every GRIN genus and species with the in-memory database.
	 *
	 * <p>
	 * A species that fails to register is logged and skipped, so that a single bad
	 * row does not prevent the remaining rows from loading.
	 * </p>
	 *
	 * @param taxonomyDatabase the database to populate
	 */
	private void readDatabase(final InMemoryTaxonomyDatabase taxonomyDatabase) {
		log.info("Loading GRIN TaxonomyGenus");
		// read taxonomy_genus
		taxonomyGenusRepository.findAll().forEach(genus -> taxonomyDatabase.registerGenus(genus.getId(), genus.getGenusName()));

		log.info("Loading GRIN TaxonomySpecies");
		// read taxonomy_species
		taxonomySpeciesRepository.findAll().forEach(species -> {
			try {
				taxonomyDatabase.registerSpecies(species);
			} catch (TaxonomyException e) {
				log.warn("Error registering GRIN Species {}", e.getMessage(), e);
			}
		});
	}

	/**
	 * Pages through all {@link Taxonomy2} records ordered by {@code id}, matches
	 * each record and saves every processed page.
	 *
	 * @param taxonomyChecker checker configured with {@code taxonomyDatabase};
	 *        currently not consulted by the matching logic
	 * @param taxonomyDatabase in-memory GRIN Taxonomy used for lookups
	 * @throws Exception if a lookup or the save operation fails
	 */
	private void run(TaxonomyChecker taxonomyChecker, TaxonomyDatabase taxonomyDatabase) throws Exception {
		int page = 0;
		Page<Taxonomy2> taxa;

		do {
			final PageRequest pagination = PageRequest.of(page++, PAGE_SIZE, Sort.by("id"));
			taxa = taxonomy2Repository.findAll(pagination);

			if (taxa.isEmpty()) {
				break;
			}
			log.info("Processing page {}/{}", taxa.getNumber(), taxa.getTotalPages());

			for (Taxonomy2 taxonomy : taxa) {
				matchTaxonomy(taxonomy, taxonomyDatabase);
			}

			final List<Taxonomy2> toSave = new ArrayList<>(taxa.getContent());
			// Keep the value-returning block form of the lambda so that overload
			// resolution of updateAsCurrentUser stays exactly as before.
			TransactionHelper.updateAsCurrentUser(() -> {
				return taxonomy2Repository.saveAll(toSave);
			});

			// Stop after the last page instead of issuing one more query for an empty page
		} while (taxa.hasNext());
	}

	/**
	 * Matches one {@link Taxonomy2} record to GRIN species rows and updates its
	 * GRIN references.
	 *
	 * <ul>
	 * <li>Exactly one row found: that row is used.</li>
	 * <li>No row found: GRIN references are cleared unless an override is set.</li>
	 * <li>Several rows found: the single row with a matching authority is used,
	 * else the single current row; otherwise the candidates are logged and the GRIN
	 * references are cleared unless an override is set.</li>
	 * </ul>
	 *
	 * <p>
	 * Records without genus or species are skipped and left untouched.
	 * </p>
	 *
	 * @param taxonomy the record to update in place
	 * @param taxonomyDatabase in-memory GRIN Taxonomy used for lookups
	 * @throws Exception if the lookup fails
	 */
	private void matchTaxonomy(final Taxonomy2 taxonomy, final TaxonomyDatabase taxonomyDatabase) throws Exception {
		final String genus = taxonomy.getGenus();
		if (genus == null || taxonomy.getSpecies() == null) {
			// Nothing to look up; previously this aborted the whole run with a NullPointerException
			log.warn("Skipping {}: genus or species is missing", taxonomy.getTaxonName());
			return;
		}

		final String species = sanitizeSpecies(genus, taxonomy.getSpecies());
		final String spAuthor = StringUtils.trimToNull(taxonomy.getSpAuthor());
		final String subtaxa = taxonomy.getSubtaxa();

		final TaxonomySpecies overrideSpecies = taxonomy.getOverrideTaxonomySpecies();
		if (overrideSpecies != null) {
			// Update current name
			taxonomy.setGrinTaxonomySpecies(overrideSpecies);
			taxonomy.setCurrentTaxonomySpecies(overrideSpecies.getCurrentTaxonomySpecies());
		}

		final List<IGrinSpecies> speciesRows = taxonomyDatabase.findSpeciesRow(genus, species, StringUtils.defaultIfBlank(subtaxa, ""));

		if (speciesRows.size() == 1) {
			maybeUpdateTaxonomySpecies(taxonomy, speciesRows.get(0));
			return;
		}

		if (speciesRows.isEmpty()) {
			log.trace("No speciesRows match genus={} species={} subtaxa={}", genus, species, subtaxa);
			clearGrinSpeciesUnlessOverridden(taxonomy);
			return;
		}

		// Several candidates: disambiguate by authority first, then by "current" flag
		final List<IGrinSpecies> matchAuthorityRows = speciesRows.stream()
			.filter(ts -> StringUtils.equalsIgnoreCase(ts.getSpeciesAuthority(), spAuthor) || stringSimilarity(ts.getSpeciesAuthority(), spAuthor) > AUTHORITY_SIMILARITY_THRESHOLD)
			.collect(Collectors.toList());
		final List<IGrinSpecies> currentSpeciesRows = speciesRows.stream().filter(ts -> ts.isCurrent()).collect(Collectors.toList());

		if (matchAuthorityRows.size() == 1) {
			maybeUpdateTaxonomySpecies(taxonomy, matchAuthorityRows.get(0));
		} else if (currentSpeciesRows.size() == 1) {
			maybeUpdateTaxonomySpecies(taxonomy, currentSpeciesRows.get(0));
		} else {
			if (log.isInfoEnabled()) {
				log.info("Multiple {} ({} current) speciesRows match genus={} species={} auth={} subtaxa={}: {}", speciesRows.size(), currentSpeciesRows.size(), genus, species, spAuthor, subtaxa,
					speciesRows.stream().map(ts -> ts.getSpeciesId() + " " + ts.getName() + " auth=" + ts.getSpeciesAuthority() + " sub=" + ts.getSubtaxa() + " curr=" + ts.isCurrent()).collect(Collectors.toSet())
				);
			}
			clearGrinSpeciesUnlessOverridden(taxonomy);
		}
	}

	/**
	 * Removes common noise from a species epithet before lookup.
	 *
	 * <p>
	 * Strips a leading repetition of the genus name ("Genus epithet"), then a
	 * leading abbreviated genus (first letter of the genus followed by a dot), and
	 * finally maps "sp." to "spp." as used by GRIN Taxonomy. Prefix comparisons
	 * ignore case and do not depend on the JVM default locale.
	 * </p>
	 *
	 * @param genus the genus name, not {@code null}
	 * @param species the raw species epithet, not {@code null}
	 * @return the sanitized species epithet
	 */
	private static String sanitizeSpecies(final String genus, final String species) {
		String sanitized = species;

		if (StringUtils.startsWithIgnoreCase(sanitized, genus + " ")) {
			final String withoutGenus = sanitized.substring(genus.length()).trim();
			log.debug("Assuming {} '{}' is '{}'", genus, sanitized, withoutGenus);
			sanitized = withoutGenus;
		}

		if (genus.length() > 0 && StringUtils.startsWithIgnoreCase(sanitized, genus.substring(0, 1) + ".")) {
			// Drop the two-character abbreviation, e.g. "A."
			final String withoutInitial = sanitized.substring(2).trim();
			log.debug("Assuming {} '{}' is '{}'", genus, sanitized, withoutInitial);
			sanitized = withoutInitial;
		}

		if (sanitized.equals("sp.")) {
			sanitized = "spp."; // GRIN Taxonomy convention
		}
		return sanitized;
	}

	/**
	 * Clears the GRIN species and current GRIN species of the record, unless the
	 * record has an override species.
	 *
	 * @param taxonomy the record to update in place
	 */
	private static void clearGrinSpeciesUnlessOverridden(final Taxonomy2 taxonomy) {
		if (taxonomy.getOverrideTaxonomySpecies() == null) {
			// Set GRIN species to null
			taxonomy.setGrinTaxonomySpecies(null);
			taxonomy.setCurrentTaxonomySpecies(null);
		}
	}

	/**
	 * Scores the similarity of two strings, ignoring case.
	 *
	 * <p>
	 * The score is the mean of {@code StringSimilarity.diceCoefficientOptimized}
	 * and {@code StringSimilarity.getLevenshteinCoefficient} applied to the
	 * lower-cased inputs. Lower-casing uses {@link Locale#ROOT} so the result does
	 * not depend on the JVM default locale.
	 * </p>
	 *
	 * @param original the first string, may be {@code null}
	 * @param candidate the second string, may be {@code null}
	 * @return the similarity score, or {@code 0} when either input is {@code null}
	 *         or empty
	 */
	private static double stringSimilarity(String original, String candidate) {
		if (StringUtils.isEmpty(original) || StringUtils.isEmpty(candidate)) {
			return 0;
		}
		final String left = original.toLowerCase(Locale.ROOT);
		final String right = candidate.toLowerCase(Locale.ROOT);
		return (StringSimilarity.diceCoefficientOptimized(left, right) + StringSimilarity.getLevenshteinCoefficient(left, right)) / 2.0f;
	}

	/**
	 * Points the record at the given GRIN species row, unless the record carries a
	 * different override.
	 *
	 * <p>
	 * When there is no override, or the override has the same id as the row, the
	 * GRIN species is set from the row, the current GRIN species is set from the
	 * row's current species id (falling back to the row's own id when that is
	 * {@code null}), and the override is cleared. Otherwise the record is left
	 * untouched and a warning is logged.
	 * </p>
	 *
	 * @param taxonomy the record to update in place
	 * @param speciesRow the GRIN species row found for the record
	 */
	private void maybeUpdateTaxonomySpecies(Taxonomy2 taxonomy, IGrinSpecies speciesRow) {
		final TaxonomySpecies override = taxonomy.getOverrideTaxonomySpecies();

		if (override != null && !speciesRow.getSpeciesId().equals(override.getId())) {
			// Don't touch!
			log.warn("Not updating {} because of our override taxonomy_species_id={} {} {} found={} {}",
				taxonomy.getTaxonName(),
				override.getId(),
				override.getName(), override.getNameAuthority(),
				speciesRow.getSpeciesId(),
				speciesRow.getName()
			);
			return;
		}

		taxonomy.setGrinTaxonomySpecies(new TaxonomySpecies(speciesRow.getSpeciesId()));
		if (speciesRow.getCurrentTaxonomySpeciesId() == null) {
			log.debug("speciesRow.getCurrentTaxonomySpeciesId is null!");
			taxonomy.setCurrentTaxonomySpecies(new TaxonomySpecies(speciesRow.getSpeciesId()));
		} else {
			taxonomy.setCurrentTaxonomySpecies(new TaxonomySpecies(speciesRow.getCurrentTaxonomySpeciesId()));
		}
		// The lookup agrees with the override (or there was none), so the override is no longer kept
		taxonomy.setOverrideTaxonomySpecies(null);
	}
}