// AccessionDuplicateFinder.java

/*
 * Copyright 2021 Global Crop Diversity Trust
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.genesys.server.service.worker.dupe;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.DoubleAdder;
import java.util.stream.Collectors;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.blocks.model.filters.NumberFilter;
import org.genesys.server.model.genesys.Accession;
import org.genesys.server.model.genesys.Taxonomy2;
import org.genesys.server.model.genesys.AccessionAlias.AliasType;
import org.genesys.server.service.AccessionService;
import org.genesys.server.service.filter.AccessionFilter;
import org.genesys.server.exception.SearchException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;

/**
 * Accession Duplicate Finder.
 */
@Component
public class AccessionDuplicateFinder extends DuplicateFinder<Accession, AccessionFilter> {

	@Autowired
	private AccessionService accessionService;

	@Override
	protected double getBestScoreThreshold() {
		return 1000d;
	}

	/**
	 * Collects accessions that could be duplicates of {@code target} by running a
	 * series of searches: by donor number, by accession name, by aliases and
	 * accession number, and by coordinates. Every search excludes the target itself,
	 * {@code excludedById} and the candidates found by the preceding searches.
	 *
	 * @param target the accession to find duplicates of; must not be {@code null}
	 * @param excludedById IDs of accessions that must not be returned, may be {@code null} or empty
	 * @param additionalFilter optional filter AND-ed to every search, may be {@code null}
	 * @return the candidate accessions; a search that fails contributes no candidates
	 */
	@Override
	protected List<Accession> getCandidates(Accession target, Collection<Long> excludedById, AccessionFilter additionalFilter) {
		assert (target != null);
		LOG.info("Searching for duplicates of {}", target);

		List<Accession> candidates = new ArrayList<>(100);

		// Genus names to search in: the declared genus plus the genera of the linked taxonomies
		Taxonomy2 taxonomy = target.getTaxonomy();
		Set<String> genusSet = new HashSet<>();
		genusSet.add(taxonomy.getGenus());

		if (taxonomy.getCurrentTaxonomySpecies() != null) {
			genusSet.add(taxonomy.getCurrentTaxonomySpecies().getTaxonomyGenus().getName());
		}
		if (taxonomy.getGrinTaxonomySpecies() != null) {
			genusSet.add(taxonomy.getGrinTaxonomySpecies().getTaxonomyGenus().getName());
		}

		// By donor
		String aDonorNumb = target.getDonorNumb();
		if (StringUtils.isNotBlank(aDonorNumb)) {
			var aDonorNumbSplit = spaceStringsAndNumbers(aDonorNumb);

			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates, additionalFilter);

			// Donor number (as given and with strings and numbers spaced) used as accession number
			filter.accessionNumbers().add(aDonorNumb);
			filter.accessionNumbers().add(aDonorNumbSplit);
			List<Accession> byAccessionNumber = findMatches("Filtering for {}", filter, 10);
			candidates.addAll(byAccessionNumber);

			// Donor number and donor code as free text
			if (StringUtils.isNotBlank(aDonorNumbSplit)) {
				// The filter is reused, but its exclusion list was built before the search
				// above: exclude those hits so they are not returned a second time.
				for (Accession found : byAccessionNumber) {
					filter.NOT.id().add(found.getId());
				}
				filter.accessionNumbers().clear();
				filter
					.institute(null)
					._text(toSafeEsQuery(aDonorNumbSplit + " " + StringUtils.defaultIfBlank(target.getDonorCode(), "")));
				candidates.addAll(findMatches("Filtering for donor number {}", filter, 20));
			}
		}

		// By genus and accession name
		if (StringUtils.isNotBlank(target.getAccessionName())) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates, additionalFilter);

			filter.taxa().genus(genusSet);
			filter._text(toStringsAndNumbers(target.getAccessionName()).stream()
				// numbers as they are, "~" appended to strings
				.map((part) -> {
					if (part instanceof Number) {
						return part.toString();
					}
					return toSafeEsQuery(part.toString()) + "~";
				})
				// join
				.collect(Collectors.joining(" ")));

			candidates.addAll(findMatches("Filtering for accession name {}", filter, 20));
		}

		var accessionId = target.getAccessionId();

		{
			// By genus and other IDs
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates, additionalFilter);

			filter.taxa().genus(genusSet);

			// Alternatives of the query: usable alias names and the accession number
			List<String> terms = new ArrayList<>();
			var aliases = accessionId.getAliases();
			if (CollectionUtils.isNotEmpty(aliases)) {
				for (var alias : aliases) {
					String name = alias.getName();
					// remove short stuff
					if (name != null && name.length() > 3) {
						String term = toSafeEsQuery(spaceStringsAndNumbers(name));
						if (StringUtils.isNotBlank(term)) {
							terms.add(term);
						}
					}
				}
			}
			// The accession number is escaped whether or not there are aliases
			String accessionNumber = target.getAccessionNumber();
			if (StringUtils.isNotBlank(accessionNumber)) {
				String term = toSafeEsQuery(accessionNumber);
				if (StringUtils.isNotBlank(term)) {
					terms.add(term);
				}
			}

			if (!terms.isEmpty()) {
				// Joining only non-blank terms avoids a query with a dangling " | "
				filter._text(String.join(" | ", terms));
				candidates.addAll(findMatches("Filtering for aliases {}", filter, 20));
			}
		}

		// By genus and coordinates
		if (accessionId.getLatitude() != null && accessionId.getLongitude() != null) {
			AccessionFilter filter = getCandidatesFilter(target, excludedById, candidates, additionalFilter);

			filter.taxa().genus(Set.of(taxonomy.getGenus()));

			filter.geo()
				.latitude(coordinateRange(accessionId.getLatitude()))
				.longitude(coordinateRange(accessionId.getLongitude()));

			candidates.addAll(findMatches("Filtering for {}", filter, 20));
		}

		return candidates;
	}

	/**
	 * Runs one candidate search and returns the first page of results. A failed
	 * search is logged with its cause and yields no results, so that the remaining
	 * searches can still run.
	 *
	 * @param logFormat log message with one placeholder for the filter
	 * @param filter the filter to search with
	 * @param pageSize maximum number of results to fetch
	 * @return the matching accessions, empty if the search failed
	 */
	private List<Accession> findMatches(String logFormat, AccessionFilter filter, int pageSize) {
		try {
			LOG.info(logFormat, filter);
			Page<Accession> matches = accessionService.list(filter, PageRequest.of(0, pageSize));
			return matches.getContent();
		} catch (SearchException e) {
			LOG.warn("Search for duplicate candidates failed: {}", e.getMessage(), e);
			return List.of();
		}
	}

	/**
	 * Builds the range of coordinate values within 1% of {@code value}.
	 *
	 * @param value latitude or longitude
	 * @return range filter with the lower bound first
	 */
	private static NumberFilter<Double> coordinateRange(double value) {
		float geoFact = 1.01f; // 1%
		double scaledDown = value / geoFact;
		double scaledUp = value * geoFact;
		// For negative values the scaled-up bound is the smaller one: order the bounds explicitly
		return new NumberFilter<Double>(Math.min(scaledDown, scaledUp), Math.max(scaledDown, scaledUp));
	}

	/**
	 * Creates the base filter for a candidate search. Its NOT part lists the IDs of
	 * the target, of {@code excludedById} and of the candidates found so far;
	 * {@code additionalFilter}, when given, is AND-ed to it.
	 *
	 * @param target the accession duplicates are searched for
	 * @param excludedById IDs to exclude, may be {@code null} or empty
	 * @param candidates candidates already found
	 * @param additionalFilter optional extra filter, may be {@code null}
	 * @return a new filter
	 */
	private AccessionFilter getCandidatesFilter(Accession target, Collection<Long> excludedById, List<Accession> candidates, AccessionFilter additionalFilter) {
		// Assemble the exclusions first
		AccessionFilter exclusions = new AccessionFilter(null);
		exclusions.id(new HashSet<>());
		var excludedIds = exclusions.id();

		excludedIds.add(target.getId());
		if (CollectionUtils.isNotEmpty(excludedById)) {
			excludedIds.addAll(excludedById);
		}
		// Not already found
		for (Accession candidate : candidates) {
			excludedIds.add(candidate.getId());
		}

		AccessionFilter result = new AccessionFilter(null);
		result.NOT(exclusions);
		if (additionalFilter != null) {
			result.AND(additionalFilter);
		}
		return result;
	}

	/**
	 * Score the match against accession. Scoring should be transitive.
	 * <p>
	 * Starting from the score already stored in the hit, points are added for
	 * agreement between the two accessions on donor and institute codes, names,
	 * donor and accession numbers, country of origin, taxonomy, collecting data,
	 * coordinates, elevation and aliases, and points are deducted when both are at
	 * the same institute. The resulting score and the distinct list of matches are
	 * written back to {@code hit}.
	 * <p>
	 * The helpers {@code notNullEquals}, {@code similarityScore} and
	 * {@code stringsAndNumbersCompare} are defined outside this class; they are
	 * handed the {@code matches} list, presumably to record what matched — confirm
	 * against DuplicateFinder.
	 *
	 * @param a the Accession
	 * @param hit the match to score; its {@code result} is the accession compared with {@code a},
	 *        its {@code score} and {@code matches} are updated
	 * @return the updated score, the same value as stored in {@code hit.score}
	 */
	@Override
	protected double scoreHit(Accession a, Hit<Accession> hit) {
		// Start from the score the hit already carries
		double score = hit.score;
		var b = hit.result;

		// Drop score a little if they're at the same institute
		if (StringUtils.equalsIgnoreCase(a.getInstCode(), b.getInstCode())) {
			score -= 400;
		}

		// Accession and donor numbers are lowercased before they are compared
		var aAccessionNumber = StringUtils.lowerCase(a.getAccessionNumber());
		var aDonorNumb = StringUtils.lowerCase(a.getDonorNumb());
		String aDonorCode = a.getDonorCode();

		var bAccessionNumber = StringUtils.lowerCase(b.getAccessionNumber());
		var bDonorNumb = StringUtils.lowerCase(b.getDonorNumb());
		String bDonorCode = b.getDonorCode();

		// Passed to every comparison below and stored in the hit at the end
		var matches = new ArrayList<String>();

		// Donor code of one is the institute code of the other
		if (notNullEquals(matches, aDonorCode, b.getInstCode())) {
			score += 100;
		}
		if (notNullEquals(matches, bDonorCode, a.getInstCode())) {
			score += 100;
		}

		// Accession name: 100 for equal names, otherwise the two partial comparisons weighted by 50 each
		if (notNullEquals(matches, a.getAccessionName(), b.getAccessionName())) {
			score += 100;
		} else {
			score += similarityScore(matches, a.getAccessionName(), b.getAccessionName()) * 50;
			score += stringsAndNumbersCompare(matches, a.getAccessionName(), b.getAccessionName()) * 50;
		}

		// Same donor code
		if (notNullEquals(matches, aDonorCode, bDonorCode)) {
			score += 50;
		}

		// Donor name
		if (notNullEquals(matches, a.getDonorName(), b.getDonorName())) {
			score += 50;
		} else {
			score += similarityScore(matches, a.getDonorName(), b.getDonorName()) * 50;
		}

		// Donor number of a against accession number of b
		if (notNullEquals(matches, aDonorNumb, bAccessionNumber)) {
			score += 400;
		} else {
			score += stringsAndNumbersCompare(matches, aDonorNumb, bAccessionNumber) * 400;
		}

		// Donor number of b against accession number of a
		if (notNullEquals(matches, bDonorNumb, aAccessionNumber)) {
			score += 400;
		} else {
			score += stringsAndNumbersCompare(matches, bDonorNumb, aAccessionNumber) * 400;
		}

		// same donor number is good
		if (notNullEquals(matches, aDonorNumb, bDonorNumb)) {
			score += 200;
		} else {
			score += stringsAndNumbersCompare(matches, aDonorNumb, bDonorNumb) * 200;
		}

		// Country
		if (notNullEquals(matches, a.getOrigCty(), b.getOrigCty())) {
			score += 80;
		}

		{
			/*
			 * Compare taxonomic data
			 */
			var at = a.getTaxonomy();
			var bt = b.getTaxonomy();

			// Same genus and species scores 200, same genus only scores 50
			if (notNullEquals(matches, at.getGenusSpecies(), bt.getGenusSpecies())) {
				score += 200;
			} else if (notNullEquals(matches, at.getGenus(), bt.getGenus())) {
				score += 50;
			}
			score += similarityScore(matches, at.getSubtaxa(), bt.getSubtaxa()) * 50;

			// Same current species
			var ats = at.getCurrentTaxonomySpecies();
			var bts = bt.getCurrentTaxonomySpecies();
			if (ats != null && bts != null) {
				if (ats.getId().equals(bts.getId())) {
					score += 100;
				}
			}
		}

		{
			/*
			 * Compare collecting data
			 */
			var ac = a.getAccessionId().getColl();
			var bc = b.getAccessionId().getColl();

			// Collecting data is compared only when both accessions have it
			if (ac != null && bc != null) {
				if (notNullEquals(matches, ac.getCollSite(), bc.getCollSite())) {
					score += 200;
				} else {
					score += similarityScore(matches, ac.getCollSite(), bc.getCollSite()) * 100;
				}

				var acDate = ac.getCollDate();
				var bcDate = bc.getCollDate();
				if (acDate != null && bcDate != null) {
					// Compare the date strings position by position over their common length:
					// every identical character adds 20
					for (var i = Math.min(acDate.length(), bcDate.length()) - 1; i >= 0; i--) {
						if (acDate.charAt(i) == bcDate.charAt(i)) {
							score += 20;
						}
					}
				}
				if (notNullEquals(matches, ac.getCollNumb(), bc.getCollNumb())) {
					score += 100;
				} else {
					score += stringsAndNumbersCompare(matches, ac.getCollNumb(), bc.getCollNumb()) * 100;
				}
				if (notNullEquals(matches, ac.getCollMissId(), bc.getCollMissId())) {
					score += 20;
				} else {
					score += similarityScore(matches, ac.getCollMissId(), bc.getCollMissId()) * 10;
				}
			}
		}

		{
			/*
			 * Compare coordinate data
			 */
			var ag = a.getAccessionId();
			var bg = b.getAccessionId();
			// TOTAL = 1030 + 100 + 100 = 1230
			// NOTE(review): the total above does not obviously add up with the weights in
			// this method; it looks stale — confirm or remove.
			if (ag != null && bg != null) {
				// Default differences of 100 fail the "< 2" test below, so nothing is
				// scored when a coordinate is missing on either side
				double diffLat = 100, diffLon = 100;
				if (ag.getLatitude() != null && bg.getLatitude() != null) {
					diffLat = Math.abs(ag.getLatitude() - bg.getLatitude());
				}
				if (ag.getLongitude() != null && bg.getLongitude() != null) {
					diffLon = Math.abs(ag.getLongitude() - bg.getLongitude());
				}
				if (diffLat < 2 && diffLon < 2) {
					// Each axis adds up to 100, decreasing linearly to 0 at a difference of 2
					var geoScore = ((2.0 - diffLat) / 2.0) * 100 + ((2.0 - diffLon) / 2.0) * 100;
					if (geoScore > 150) {
						matches.add("Coordinates");
					}
					score += geoScore;
				}

				// Elevations less than 100 apart count as a match
				if (ag.getElevation() != null && bg.getElevation() != null) {
					if (Math.abs(ag.getElevation() - bg.getElevation()) < 100) {
						matches.add("Elevation");
						score += 50;
					}
				}
			}
		}

		{
			/*
			 * Check aliases
			 */
			var aa = a.getAccessionId().getAliases();
			var ba = b.getAccessionId().getAliases();

			if (aa != null && ba != null) {
				List<String> bAliases = ba.stream()
					// collecting and other numbers
					.filter((x) -> x.getAliasType() == AliasType.OTHERNUMB || x.getAliasType() == AliasType.COLLNUMB)
					// only distinct names
					.map((x) -> x.getName()).distinct().collect(Collectors.toList());
				List<String> aAliases = aa.stream()
					// collecting and other numbers
					.filter((x) -> x.getAliasType() == AliasType.OTHERNUMB || x.getAliasType() == AliasType.COLLNUMB)
					// only distinct names
					.map((x) -> x.getName()).distinct().collect(Collectors.toList());
				// Compare in both directions: aliases of a against b, and aliases of b against a
				score += compareAliases(matches, b, bAliases, aAliases) + compareAliases(matches, a, aAliases, bAliases);
			}
		}

		// Store the distinct matches and the final score in the hit
		hit.matches = matches.stream().distinct().collect(Collectors.toList());
		hit.score = score;
		return score;
	}

	/**
	 * Scores every alias in {@code otherAliases} against the aliases of
	 * {@code accession} and against its DOI, accession number, donor number and
	 * accession name.
	 *
	 * @param matches collection handed to each comparison helper
	 * @param accession the accession whose own data the other aliases are compared to
	 * @param accessionAliases alias names of {@code accession}
	 * @param otherAliases alias names of the other accession
	 * @return the sum of comparison scores, 0 when {@code otherAliases} is empty
	 */
	private double compareAliases(final Collection<String> matches, Accession accession, List<String> accessionAliases, List<String> otherAliases) {
		if (otherAliases.isEmpty()) {
			return 0;
		}

		// Values of the accession that every other alias is checked against
		final var doi = accession.getDoi();
		final var accessionNumber = accession.getAccessionNumber();
		final var donorNumber = accession.getDonorNumb();
		final String accessionName = accession.getAccessionName();
		final boolean hasDonorNumber = StringUtils.isNotBlank(donorNumber);

		double total = 0;
		for (String otherAlias : otherAliases) {
			// Alias against alias
			for (String ownAlias : accessionAliases) {
				if (notNullEquals(matches, ownAlias, otherAlias)) {
					total += 100;
				} else {
					total += stringsAndNumbersCompare(matches, ownAlias, otherAlias) * 80;
				}
			}

			// Alias is the DOI
			if (notNullEquals(matches, otherAlias, doi)) {
				total += 1000;
			}

			// Alias against the accession number
			if (notNullEquals(matches, otherAlias, accessionNumber)) {
				total += 400;
			} else {
				total += stringsAndNumbersCompare(matches, otherAlias, accessionNumber) * 400;
			}

			// Alias against the donor number, when there is one
			if (hasDonorNumber) {
				if (notNullEquals(matches, otherAlias, donorNumber)) {
					total += 100;
				} else {
					total += stringsAndNumbersCompare(matches, otherAlias, donorNumber) * 100;
				}
			}

			// Alias against the accession name
			total += similarityScore(matches, otherAlias, accessionName) * 50;
			total += stringsAndNumbersCompare(matches, otherAlias, accessionName) * 50;
		}
		return total;
	}

}