// AccessionDuplicateFinder.java
/*
* Copyright 2021 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys.server.service.worker.dupe;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.DoubleAdder;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.blocks.model.filters.NumberFilter;
import org.genesys.server.model.genesys.Accession;
import org.genesys.server.model.genesys.Taxonomy2;
import org.genesys.server.model.genesys.AccessionAlias.AliasType;
import org.genesys.server.service.AccessionService;
import org.genesys.server.service.filter.AccessionFilter;
import org.genesys.server.exception.SearchException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;
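/**
* Accession duplicate finder: gathers candidate accessions with a series of
* searches (donor number, accession name, aliases, coordinates) and scores
* each candidate against the target accession.
*/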
@Component
public class AccessionDuplicateFinder extends DuplicateFinder<Accession, AccessionFilter> {
@Autowired
private AccessionService accessionService;
/**
 * Score threshold reported by this finder.
 *
 * <p>NOTE(review): how the threshold is applied is defined by the base
 * {@code DuplicateFinder}, which is not visible here; presumably hits scoring
 * at or above it are treated as the best matches. Confirm against the base class.
 *
 * @return the fixed threshold of 1000
 */
@Override
protected double getBestScoreThreshold() {
	final double threshold = 1000d;
	return threshold;
}
/**
 * Collect accessions that could be duplicates of {@code target}.
 *
 * <p>Runs up to five searches and returns their combined first pages:
 * <ol>
 * <li>accessions whose accession number equals the target's donor number,</li>
 * <li>free-text search for the donor number and donor code,</li>
 * <li>genus plus fuzzy accession name,</li>
 * <li>genus plus alias names and the target's accession number,</li>
 * <li>genus plus coordinates within 1% of the target's coordinates.</li>
 * </ol>
 * Every search excludes the target itself, {@code excludedById} and the
 * candidates found by earlier searches. A failed search is logged and skipped.
 *
 * @param target the accession to find duplicates of; must not be null
 * @param excludedById IDs of accessions that must not be returned; may be null or empty
 * @param additionalFilter optional filter AND-ed to every search; may be null
 * @return the candidates found, in the order the searches returned them
 */
@Override
protected List<Accession> getCandidates(Accession target, Collection<Long> excludedById, AccessionFilter additionalFilter) {
	assert (target != null);
	LOG.info("Searching for duplicates of {}", target.toString());

	List<Accession> candidates = new ArrayList<>(100);

	// Genus names to search in: the recorded genus plus the genus of the linked current and GRIN species
	Taxonomy2 taxonomy = target.getTaxonomy();
	Set<String> genusSet = new HashSet<>();
	genusSet.add(taxonomy.getGenus());
	if (taxonomy.getCurrentTaxonomySpecies() != null) {
		genusSet.add(taxonomy.getCurrentTaxonomySpecies().getTaxonomyGenus().getName());
	}
	if (taxonomy.getGrinTaxonomySpecies() != null) {
		genusSet.add(taxonomy.getGrinTaxonomySpecies().getTaxonomyGenus().getName());
	}

	// By donor
	String aDonorNumb = target.getDonorNumb();
	if (StringUtils.isNotBlank(aDonorNumb)) {
		var aDonorNumbSplit = spaceStringsAndNumbers(aDonorNumb);
		AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates, additionalFilter);

		// Donor number used as somebody's accession number
		filter.accessionNumbers().add(aDonorNumb);
		filter.accessionNumbers().add(aDonorNumbSplit);
		searchAndCollect(candidates, filter, 10, "Filtering for {}");

		// Donor number and donor code as free text
		if (StringUtils.isNotBlank(aDonorNumbSplit)) {
			filter.accessionNumbers().clear();
			// The filter is reused, so exclude what the previous search has just found;
			// otherwise the same accession could be added to the candidates twice.
			filter.NOT.id().addAll(candidates.stream().map(Accession::getId).collect(Collectors.toSet()));
			filter
				.institute(null)
				._text(toSafeEsQuery(aDonorNumbSplit + " " + StringUtils.defaultIfBlank(target.getDonorCode(), "")));
			searchAndCollect(candidates, filter, 20, "Filtering for donor number {}");
		}
	}

	// By genus and accession name
	String accessionName = target.getAccessionName();
	if (StringUtils.isNotBlank(accessionName)) {
		AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates, additionalFilter);
		filter.taxa().genus(genusSet);
		filter._text(toStringsAndNumbers(accessionName).stream()
			// numbers are searched as-is, strings get the "~" suffix
			.map((part) -> part instanceof Number ? part.toString() : toSafeEsQuery(part.toString()) + "~")
			.collect(Collectors.joining(" ")));
		searchAndCollect(candidates, filter, 20, "Filtering for accession name {}");
	}

	{
		// By genus and other IDs
		AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates, additionalFilter);
		filter.taxa().genus(genusSet);

		List<String> terms = new ArrayList<>();
		var aliases = target.getAccessionId().getAliases();
		// Aliases are treated as nullable elsewhere in this class (see scoreHit)
		if (CollectionUtils.isNotEmpty(aliases)) {
			terms.addAll(aliases.stream()
				// Just names
				.map((alias) -> alias.getName())
				// remove short stuff
				.filter((name) -> name != null && name.length() > 3)
				// Strings and numbers
				.map((name) -> spaceStringsAndNumbers(name))
				// cleanup
				.map((name) -> toSafeEsQuery(name))
				// drop terms that are empty after cleanup so the query has no empty alternatives
				.filter(StringUtils::isNotBlank)
				.collect(Collectors.toList()));
		}
		// The accession number is always escaped, with or without aliases
		String accessionNumber = target.getAccessionNumber();
		if (StringUtils.isNotBlank(accessionNumber)) {
			terms.add(toSafeEsQuery(accessionNumber));
		}

		if (!terms.isEmpty()) {
			// ES search string: any of the terms
			filter._text(String.join(" | ", terms));
			searchAndCollect(candidates, filter, 20, "Filtering for aliases {}");
		}
	}

	// By genus and coordinates
	Double latitude = target.getAccessionId().getLatitude();
	Double longitude = target.getAccessionId().getLongitude();
	if (latitude != null && longitude != null) {
		AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates, additionalFilter);
		filter.taxa().genus(Set.of(taxonomy.getGenus()));
		float geoFact = 1.01f; // 1%
		// For negative coordinates value / geoFact is GREATER than value * geoFact,
		// so order the two bounds explicitly to avoid an inverted (always empty) range.
		double latA = latitude / geoFact;
		double latB = latitude * geoFact;
		double lonA = longitude / geoFact;
		double lonB = longitude * geoFact;
		filter.geo()
			.latitude(new NumberFilter<Double>(Math.min(latA, latB), Math.max(latA, latB)))
			.longitude(new NumberFilter<Double>(Math.min(lonA, lonB), Math.max(lonA, lonB)));
		searchAndCollect(candidates, filter, 20, "Filtering for {}");
	}

	return candidates;
}

/**
 * Run one candidate search and append its first page of results to {@code candidates}.
 * A {@link SearchException} is logged (with its stack trace) and otherwise ignored,
 * so that the remaining searches still run.
 *
 * @param candidates the list to append matches to
 * @param filter the filter to search with
 * @param pageSize maximum number of matches to take
 * @param logFormat log message with one <code>{}</code> placeholder for the filter
 */
private void searchAndCollect(List<Accession> candidates, AccessionFilter filter, int pageSize, String logFormat) {
	try {
		LOG.info(logFormat, filter.toString());
		Page<Accession> matches = accessionService.list(filter, PageRequest.of(0, pageSize));
		candidates.addAll(matches.getContent());
	} catch (SearchException e) {
		LOG.warn(e.getMessage(), e);
	}
}
/**
 * Build the base filter for a candidate search.
 *
 * <p>The filter's {@code NOT} part lists the IDs to leave out: the target itself,
 * everything in {@code excludedById} and every candidate found so far.
 *
 * @param target the accession duplicates are searched for
 * @param excludedById IDs to exclude; may be null or empty
 * @param candidates candidates already found, excluded from the new search
 * @param additionalFilter optional filter AND-ed to the result; may be null
 * @return a new filter carrying the exclusions and the optional additional filter
 */
private AccessionFilter getCandidatesFilter(Accession target, Collection<Long> excludedById, List<Accession> candidates, AccessionFilter additionalFilter) {
	// Gather every ID to exclude before touching the filter
	var excludedIds = new HashSet<Long>();
	excludedIds.add(target.getId());
	if (CollectionUtils.isNotEmpty(excludedById)) {
		excludedIds.addAll(excludedById);
	}
	for (Accession found : candidates) {
		// Not already found
		excludedIds.add(found.getId());
	}

	AccessionFilter result = new AccessionFilter(null);
	result.NOT(new AccessionFilter(null));
	result.NOT.id(excludedIds);
	if (additionalFilter != null) {
		result.AND(additionalFilter);
	}
	return result;
}
/**
 * Score a search hit against the accession being checked. Scoring should be
 * transitive.
 *
 * <p>Starts from the hit's own score, subtracts 400 when both records are held by
 * the same institute, then adds points for agreement on donor and institute codes,
 * accession and donor names, donor/accession numbers, country of origin, taxonomy,
 * collecting data, coordinates, elevation and aliases. The comparison helpers are
 * handed the list of match labels; the de-duplicated list is stored in
 * {@code hit.matches} and the final score in {@code hit.score}.
 *
 * @param a the accession duplicates are searched for
 * @param hit the candidate and its search score; its {@code score} and
 *            {@code matches} are overwritten
 * @return the final score, same value as stored in {@code hit.score}
 */
@Override
protected double scoreHit(Accession a, Hit<Accession> hit) {
	final Accession other = hit.result;
	final List<String> reasons = new ArrayList<>();
	double total = hit.score;

	// Drop score a little if they're at the same institute
	if (StringUtils.equalsIgnoreCase(a.getInstCode(), other.getInstCode())) {
		total -= 400;
	}

	// Identifiers are compared case-insensitively
	final var ownAccNumb = StringUtils.lowerCase(a.getAccessionNumber());
	final var ownDonorNumb = StringUtils.lowerCase(a.getDonorNumb());
	final String ownDonorCode = a.getDonorCode();
	final var otherAccNumb = StringUtils.lowerCase(other.getAccessionNumber());
	final var otherDonorNumb = StringUtils.lowerCase(other.getDonorNumb());
	final String otherDonorCode = other.getDonorCode();

	// One record's donor is the other record's holding institute
	if (notNullEquals(reasons, ownDonorCode, other.getInstCode())) {
		total += 100;
	}
	if (notNullEquals(reasons, otherDonorCode, a.getInstCode())) {
		total += 100;
	}

	// Accession name
	if (notNullEquals(reasons, a.getAccessionName(), other.getAccessionName())) {
		total += 100;
	} else {
		total += similarityScore(reasons, a.getAccessionName(), other.getAccessionName()) * 50;
		total += stringsAndNumbersCompare(reasons, a.getAccessionName(), other.getAccessionName()) * 50;
	}

	// Donor code and donor name
	if (notNullEquals(reasons, ownDonorCode, otherDonorCode)) {
		total += 50;
	}
	total += notNullEquals(reasons, a.getDonorName(), other.getDonorName())
		? 50
		: similarityScore(reasons, a.getDonorName(), other.getDonorName()) * 50;

	// Donor number of one record against the accession number of the other
	total += notNullEquals(reasons, ownDonorNumb, otherAccNumb)
		? 400
		: stringsAndNumbersCompare(reasons, ownDonorNumb, otherAccNumb) * 400;
	total += notNullEquals(reasons, otherDonorNumb, ownAccNumb)
		? 400
		: stringsAndNumbersCompare(reasons, otherDonorNumb, ownAccNumb) * 400;

	// same donor number is good
	total += notNullEquals(reasons, ownDonorNumb, otherDonorNumb)
		? 200
		: stringsAndNumbersCompare(reasons, ownDonorNumb, otherDonorNumb) * 200;

	// Country
	if (notNullEquals(reasons, a.getOrigCty(), other.getOrigCty())) {
		total += 80;
	}

	/*
	 * Compare taxonomic data
	 */
	final var ownTaxon = a.getTaxonomy();
	final var otherTaxon = other.getTaxonomy();
	if (notNullEquals(reasons, ownTaxon.getGenusSpecies(), otherTaxon.getGenusSpecies())) {
		total += 200;
	} else if (notNullEquals(reasons, ownTaxon.getGenus(), otherTaxon.getGenus())) {
		total += 50;
	}
	total += similarityScore(reasons, ownTaxon.getSubtaxa(), otherTaxon.getSubtaxa()) * 50;
	// Same current species
	final var ownSpecies = ownTaxon.getCurrentTaxonomySpecies();
	final var otherSpecies = otherTaxon.getCurrentTaxonomySpecies();
	if (ownSpecies != null && otherSpecies != null && ownSpecies.getId().equals(otherSpecies.getId())) {
		total += 100;
	}

	/*
	 * Compare collecting data
	 */
	final var ownId = a.getAccessionId();
	final var otherId = other.getAccessionId();
	final var ownColl = ownId.getColl();
	final var otherColl = otherId.getColl();
	if (ownColl != null && otherColl != null) {
		total += notNullEquals(reasons, ownColl.getCollSite(), otherColl.getCollSite())
			? 200
			: similarityScore(reasons, ownColl.getCollSite(), otherColl.getCollSite()) * 100;

		// Collecting dates: 20 points for every position holding the same character
		final var ownDate = ownColl.getCollDate();
		final var otherDate = otherColl.getCollDate();
		if (ownDate != null && otherDate != null) {
			final int comparable = Math.min(ownDate.length(), otherDate.length());
			for (int pos = 0; pos < comparable; pos++) {
				if (ownDate.charAt(pos) == otherDate.charAt(pos)) {
					total += 20;
				}
			}
		}

		total += notNullEquals(reasons, ownColl.getCollNumb(), otherColl.getCollNumb())
			? 100
			: stringsAndNumbersCompare(reasons, ownColl.getCollNumb(), otherColl.getCollNumb()) * 100;

		total += notNullEquals(reasons, ownColl.getCollMissId(), otherColl.getCollMissId())
			? 20
			: similarityScore(reasons, ownColl.getCollMissId(), otherColl.getCollMissId()) * 10;
	}

	/*
	 * Compare coordinate data
	 */
	if (ownId != null && otherId != null) {
		// A missing coordinate on either side leaves the gap at 100, which rules out a geo match
		double latGap = 100;
		double lonGap = 100;
		if (ownId.getLatitude() != null && otherId.getLatitude() != null) {
			latGap = Math.abs(ownId.getLatitude() - otherId.getLatitude());
		}
		if (ownId.getLongitude() != null && otherId.getLongitude() != null) {
			lonGap = Math.abs(ownId.getLongitude() - otherId.getLongitude());
		}
		if (latGap < 2 && lonGap < 2) {
			// Up to 100 points per axis, shrinking linearly to 0 at a gap of 2
			var geoScore = ((2.0 - latGap) / 2.0) * 100 + ((2.0 - lonGap) / 2.0) * 100;
			if (geoScore > 150) {
				reasons.add("Coordinates");
			}
			total += geoScore;
		}

		if (ownId.getElevation() != null && otherId.getElevation() != null
			&& Math.abs(ownId.getElevation() - otherId.getElevation()) < 100) {
			reasons.add("Elevation");
			total += 50;
		}
	}

	/*
	 * Check aliases
	 */
	final var ownAliasRecords = ownId.getAliases();
	final var otherAliasRecords = otherId.getAliases();
	if (ownAliasRecords != null && otherAliasRecords != null) {
		// Distinct names of collecting numbers and other numbers only
		List<String> ownNames = ownAliasRecords.stream()
			.filter((alias) -> alias.getAliasType() == AliasType.OTHERNUMB || alias.getAliasType() == AliasType.COLLNUMB)
			.map((alias) -> alias.getName())
			.distinct()
			.collect(Collectors.toList());
		List<String> otherNames = otherAliasRecords.stream()
			.filter((alias) -> alias.getAliasType() == AliasType.OTHERNUMB || alias.getAliasType() == AliasType.COLLNUMB)
			.map((alias) -> alias.getName())
			.distinct()
			.collect(Collectors.toList());

		// Compare in both directions; the candidate side is evaluated first
		double againstOther = compareAliases(reasons, other, otherNames, ownNames);
		double againstOwn = compareAliases(reasons, a, ownNames, otherNames);
		total += againstOther + againstOwn;
	}

	hit.matches = reasons.stream().distinct().collect(Collectors.toList());
	hit.score = total;
	return total;
}
/**
 * Compare every alias in {@code otherAliases} with the aliases of
 * {@code accession} and with its DOI, accession number, donor number and
 * accession name.
 *
 * @param matches collection handed to the comparison helpers for recording matches
 * @param accession the accession whose identifiers are compared against
 * @param accessionAliases alias names of {@code accession}
 * @param otherAliases alias names of the other accession
 * @return the sum of comparison scores, 0 when {@code otherAliases} is empty
 */
private double compareAliases(final Collection<String> matches, Accession accession, List<String> accessionAliases, List<String> otherAliases) {
	if (otherAliases.isEmpty()) {
		return 0;
	}

	double sum = 0;
	for (String otherAlias : otherAliases) {
		// Alias against alias
		for (String ownAlias : accessionAliases) {
			sum += notNullEquals(matches, ownAlias, otherAlias)
				? 100
				: stringsAndNumbersCompare(matches, ownAlias, otherAlias) * 80;
		}

		// Check DOI
		if (notNullEquals(matches, otherAlias, accession.getDoi())) {
			sum += 1000;
		}

		// Check accession number
		sum += notNullEquals(matches, otherAlias, accession.getAccessionNumber())
			? 400
			: stringsAndNumbersCompare(matches, otherAlias, accession.getAccessionNumber()) * 400;

		// Check accession donor number, only when there is one
		if (StringUtils.isNotBlank(accession.getDonorNumb())) {
			sum += notNullEquals(matches, otherAlias, accession.getDonorNumb())
				? 100
				: stringsAndNumbersCompare(matches, otherAlias, accession.getDonorNumb()) * 100;
		}

		// Accession name
		final String name = accession.getAccessionName();
		sum += similarityScore(matches, otherAlias, name) * 50;
		sum += stringsAndNumbersCompare(matches, otherAlias, name) * 50;
	}
	return sum;
}
}