Taxonomy2GRINMatcher.java
package org.genesys.server.service.worker;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.genesys.server.model.genesys.Taxonomy2;
import org.genesys.server.model.grin.TaxonomySpecies;
import org.genesys.server.persistence.Taxonomy2Repository;
import org.genesys.server.persistence.grin.TaxonomyGenusRepository;
import org.genesys.server.persistence.grin.TaxonomySpeciesRepository;
import org.genesys.spring.TransactionHelper;
import org.genesys.taxonomy.checker.CachingInMemoryTaxonomyDatabase;
import org.genesys.taxonomy.checker.InMemoryTaxonomyDatabase;
import org.genesys.taxonomy.checker.StringSimilarity;
import org.genesys.taxonomy.checker.TaxonomyChecker;
import org.genesys.taxonomy.checker.TaxonomyDatabase;
import org.genesys.taxonomy.checker.TaxonomyException;
import org.genesys.taxonomy.gringlobal.model.IGrinSpecies;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import lombok.extern.slf4j.Slf4j;
/**
 * Links {@link Taxonomy2} records to GRIN Taxonomy species.
 *
 * <p>The GRIN genus and species tables are loaded into an in-memory
 * {@link TaxonomyDatabase}; every {@link Taxonomy2} record is then looked up by
 * genus, species and subtaxa, and its GRIN species / current species references
 * are set, kept (when a manual override is present) or cleared.
 */
@Component
@Slf4j
public class Taxonomy2GRINMatcher {

  /** Number of {@link Taxonomy2} records fetched and saved per batch. */
  private static final int PAGE_SIZE = 1000;

  /** Similarity score above which two species authority strings are treated as the same. */
  private static final double AUTHORITY_SIMILARITY_THRESHOLD = 0.8;

  @Autowired
  private TaxonomyGenusRepository taxonomyGenusRepository;

  @Autowired
  private TaxonomySpeciesRepository taxonomySpeciesRepository;

  @Autowired
  private Taxonomy2Repository taxonomy2Repository;

  /**
   * Loads GRIN Taxonomy into memory and re-matches all {@link Taxonomy2} records against it.
   *
   * <p>NOTE(review): this method is declared read-only while {@link #run} persists changes
   * through {@code TransactionHelper.updateAsCurrentUser} — confirm that helper runs the
   * save in its own writable transaction.
   *
   * @throws Exception if matching or saving fails
   */
  @Transactional(readOnly = true)
  public void update() throws Exception {
    InMemoryTaxonomyDatabase taxonomyDatabase = new CachingInMemoryTaxonomyDatabase();
    readDatabase(taxonomyDatabase);
    log.warn("Loaded GRIN Taxonomy to memory");

    TaxonomyChecker taxonomyChecker = new TaxonomyChecker();
    taxonomyChecker.setTaxonomyDatabase(taxonomyDatabase);

    run(taxonomyChecker, taxonomyDatabase);
    log.warn("Done matching Taxonomy2 to GRIN TaxonomySpecies.");
  }

  /**
   * Registers every GRIN genus and species from the repositories with the in-memory database.
   * A species that fails to register is logged and skipped.
   *
   * @param taxonomyDatabase the in-memory database to populate
   */
  private void readDatabase(final InMemoryTaxonomyDatabase taxonomyDatabase) {
    log.info("Loading GRIN TaxonomyGenus");
    // read taxonomy_genus
    taxonomyGenusRepository.findAll().forEach(genus -> {
      taxonomyDatabase.registerGenus(genus.getId(), genus.getGenusName());
    });

    log.info("Loading GRIN TaxonomySpecies");
    // read taxonomy_species
    taxonomySpeciesRepository.findAll().forEach(species -> {
      try {
        taxonomyDatabase.registerSpecies(species);
      } catch (TaxonomyException e) {
        log.warn("Error registering GRIN Species {}", e.getMessage(), e);
      }
    });
  }

  /**
   * Pages through all {@link Taxonomy2} records ordered by id, matches each one against the
   * GRIN database and saves every page after processing.
   *
   * @param taxonomyChecker  currently unused; kept so the signature stays unchanged
   * @param taxonomyDatabase the populated GRIN lookup database
   * @throws Exception if the lookup or the save fails
   */
  private void run(TaxonomyChecker taxonomyChecker, TaxonomyDatabase taxonomyDatabase) throws Exception {
    int page = 0;
    Page<Taxonomy2> taxa;
    do {
      taxa = taxonomy2Repository.findAll(PageRequest.of(page++, PAGE_SIZE, Sort.by("id")));
      if (taxa.isEmpty()) {
        // Nothing to process (e.g. empty table)
        break;
      }
      // Page numbers are zero-based; report them one-based against the total
      log.info("Processing page {}/{}", taxa.getNumber() + 1, taxa.getTotalPages());

      for (Taxonomy2 taxonomy : taxa) {
        final String genus = taxonomy.getGenus();
        final String species = sanitizeSpecies(genus, taxonomy.getSpecies());
        final String subtaxa = taxonomy.getSubtaxa();

        applyOverride(taxonomy);

        final List<IGrinSpecies> speciesRows = taxonomyDatabase.findSpeciesRow(genus, species, StringUtils.defaultIfBlank(subtaxa, ""));
        applyGrinMatches(taxonomy, genus, species, subtaxa, speciesRows);
      }

      final List<Taxonomy2> toSave = new ArrayList<>(taxa.getContent());
      TransactionHelper.updateAsCurrentUser(() -> {
        return taxonomy2Repository.saveAll(toSave);
      });
      // hasNext() avoids fetching one extra, empty page after the last one
    } while (taxa.hasNext());
  }

  /**
   * Cleans up common data-entry issues in a species epithet before it is looked up:
   * a leading repetition of the genus name, a leading abbreviated genus (first letter
   * followed by a dot), and {@code "sp."} instead of {@code "spp."}.
   *
   * <p>Prefix comparisons ignore case without depending on the JVM default locale.
   *
   * @param genus   the genus of the record, may be {@code null}
   * @param species the raw species value, may be {@code null}
   * @return the sanitized species, or {@code null} when {@code species} is {@code null}
   */
  private static String sanitizeSpecies(String genus, String species) {
    if (species == null) {
      return null;
    }
    String sanitized = species;
    if (genus != null) {
      if (startsWithIgnoreCase(sanitized, genus + " ")) {
        final String stripped = sanitized.substring(genus.length()).trim();
        log.debug("Assuming {} '{}' is '{}'", genus, sanitized, stripped);
        sanitized = stripped;
      }
      if (!genus.isEmpty() && startsWithIgnoreCase(sanitized, genus.substring(0, 1) + ".")) {
        final String stripped = sanitized.substring(2).trim();
        log.debug("Assuming {} '{}' is '{}'", genus, sanitized, stripped);
        sanitized = stripped;
      }
    }
    if ("sp.".equals(sanitized)) {
      sanitized = "spp."; // GRIN Taxonomy convention
    }
    return sanitized;
  }

  /** Tests whether {@code text} begins with {@code prefix}, ignoring case, locale-independently. */
  private static boolean startsWithIgnoreCase(String text, String prefix) {
    return text.regionMatches(true, 0, prefix, 0, prefix.length());
  }

  /**
   * When the record carries a manual override species, uses it as the GRIN species and
   * takes the current species from the override.
   */
  private static void applyOverride(Taxonomy2 taxonomy) {
    final TaxonomySpecies overrideSpecies = taxonomy.getOverrideTaxonomySpecies();
    if (overrideSpecies != null) {
      // Update current name
      taxonomy.setGrinTaxonomySpecies(overrideSpecies);
      taxonomy.setCurrentTaxonomySpecies(overrideSpecies.getCurrentTaxonomySpecies());
    }
  }

  /**
   * Applies the lookup result to the record: a single row is used directly; no rows clear
   * the GRIN link; several rows are disambiguated first by species authority, then by
   * the "current" flag, and otherwise clear the GRIN link. Records with a manual override
   * never have their link cleared.
   *
   * @param taxonomy    the record being matched
   * @param genus       genus used for the lookup (for logging)
   * @param species     sanitized species used for the lookup (for logging)
   * @param subtaxa     subtaxa of the record (for logging)
   * @param speciesRows the GRIN rows found for this record
   */
  private void applyGrinMatches(Taxonomy2 taxonomy, String genus, String species, String subtaxa, List<IGrinSpecies> speciesRows) {
    if (speciesRows.size() == 1) {
      maybeUpdateTaxonomySpecies(taxonomy, speciesRows.get(0));
      return;
    }
    if (speciesRows.isEmpty()) {
      log.trace("No speciesRows match genus={} species={} subtaxa={}", genus, species, subtaxa);
      clearGrinMatchUnlessOverridden(taxonomy);
      return;
    }

    final String spAuthor = StringUtils.trimToNull(taxonomy.getSpAuthor());
    // NOTE(review): StringUtils.equalsIgnoreCase treats two nulls as equal, so a record without
    // an author matches GRIN rows without an authority — confirm this is intended.
    final List<IGrinSpecies> matchAuthorityRows = speciesRows.stream()
        .filter(ts -> StringUtils.equalsIgnoreCase(ts.getSpeciesAuthority(), spAuthor)
            || stringSimilarity(ts.getSpeciesAuthority(), spAuthor) > AUTHORITY_SIMILARITY_THRESHOLD)
        .collect(Collectors.toList());
    final List<IGrinSpecies> currentSpeciesRows = speciesRows.stream()
        .filter(IGrinSpecies::isCurrent)
        .collect(Collectors.toList());

    if (matchAuthorityRows.size() == 1) {
      maybeUpdateTaxonomySpecies(taxonomy, matchAuthorityRows.get(0));
    } else if (currentSpeciesRows.size() == 1) {
      maybeUpdateTaxonomySpecies(taxonomy, currentSpeciesRows.get(0));
    } else {
      // Guarded because building the row summary is not free
      if (log.isInfoEnabled()) {
        log.info("Multiple {} ({} current) speciesRows match genus={} species={} auth={} subtaxa={}: {}", speciesRows.size(), currentSpeciesRows.size(), genus, species, spAuthor, subtaxa,
            speciesRows.stream().map(ts -> ts.getSpeciesId() + " " + ts.getName() + " auth=" + ts.getSpeciesAuthority() + " sub=" + ts.getSubtaxa() + " curr=" + ts.isCurrent()).collect(Collectors.toSet())
        );
      }
      clearGrinMatchUnlessOverridden(taxonomy);
    }
  }

  /** Removes the GRIN species and current species links unless a manual override is set. */
  private static void clearGrinMatchUnlessOverridden(Taxonomy2 taxonomy) {
    if (taxonomy.getOverrideTaxonomySpecies() == null) {
      // Set GRIN species to null
      taxonomy.setGrinTaxonomySpecies(null);
      taxonomy.setCurrentTaxonomySpecies(null);
    }
  }

  /**
   * Scores how alike two strings are, ignoring case, by averaging
   * {@code StringSimilarity.diceCoefficientOptimized} and
   * {@code StringSimilarity.getLevenshteinCoefficient}.
   *
   * @param original  first string, may be {@code null}
   * @param candidate second string, may be {@code null}
   * @return the averaged score, or {@code 0} when either string is {@code null} or empty
   */
  private static double stringSimilarity(String original, String candidate) {
    if (original == null || candidate == null || original.isEmpty() || candidate.isEmpty()) {
      return 0;
    }
    // Locale.ROOT keeps the comparison independent of the JVM default locale
    final String left = original.toLowerCase(Locale.ROOT);
    final String right = candidate.toLowerCase(Locale.ROOT);
    var score = (StringSimilarity.diceCoefficientOptimized(left, right) + StringSimilarity.getLevenshteinCoefficient(left, right)) / 2.0f;
    return score;
  }

  /**
   * Points the record at the given GRIN species row, unless the record has a manual override
   * for a different species, in which case it is left untouched and a warning is logged.
   *
   * <p>When the row is applied, the current species falls back to the row itself if the row
   * has no current species id, and the override is cleared.
   *
   * @param taxonomy   the record to update
   * @param speciesRow the matched GRIN species row
   */
  private void maybeUpdateTaxonomySpecies(Taxonomy2 taxonomy, IGrinSpecies speciesRow) {
    final TaxonomySpecies override = taxonomy.getOverrideTaxonomySpecies();
    if (override != null && !speciesRow.getSpeciesId().equals(override.getId())) {
      // Don't touch!
      log.warn("Not updating {} because of our override taxonomy_species_id={} {} {} found={} {}",
          taxonomy.getTaxonName(),
          override.getId(),
          override.getName(), override.getNameAuthority(),
          speciesRow.getSpeciesId(),
          speciesRow.getName()
      );
      return;
    }

    taxonomy.setGrinTaxonomySpecies(new TaxonomySpecies(speciesRow.getSpeciesId()));
    if (speciesRow.getCurrentTaxonomySpeciesId() == null) {
      log.debug("speciesRow.getCurrentTaxonomySpeciesId is null!");
      taxonomy.setCurrentTaxonomySpecies(new TaxonomySpecies(speciesRow.getSpeciesId()));
    } else {
      taxonomy.setCurrentTaxonomySpecies(new TaxonomySpecies(speciesRow.getCurrentTaxonomySpeciesId()));
    }
    // The automatic match agrees with (or there was no) override, so it is no longer needed
    taxonomy.setOverrideTaxonomySpecies(null);
  }
}