DescriptorDuplicateFinder.java

/*
 * Copyright 2021 Global Crop Diversity Trust
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.genesys.server.service.worker.dupe;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import org.genesys.server.component.security.SecurityUtils;
import org.genesys.server.exception.SearchException;
import org.genesys.server.model.Partner;
import org.genesys.server.model.PublishState;
import org.genesys.server.model.UserRole;
import org.genesys.server.model.filters.DescriptorFilter;
import org.genesys.server.model.traits.Descriptor;
import org.genesys.server.model.vocab.VocabularyTerm;
import org.genesys.server.service.DescriptorService;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.security.acls.domain.BasePermission;
import org.springframework.stereotype.Component;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.hibernate.Hibernate;


/**
 * Find potentially duplicate Descriptors.
 *
 * <p>
 * Candidates are collected with up to two searches (full-text on the title and
 * column name keywords, and an exact match on the column name) and are then
 * scored in {@link #scoreHit(Descriptor, Hit)} by comparing their properties
 * and vocabulary terms with the target descriptor.
 * </p>
 *
 * @author Matija Obreza
 */
@Component
public class DescriptorDuplicateFinder extends DuplicateFinder<Descriptor, DescriptorFilter> {

	/** Maximum number of descriptors fetched by each candidate query. */
	private static final int MAX_CANDIDATES_PER_QUERY = 50;

	@Autowired
	private DescriptorService descriptorService;
	@Autowired
	private SecurityUtils securityUtils;

	/**
	 * Score threshold used by this finder.
	 *
	 * @return the threshold, fixed at 800
	 */
	@Override
	protected double getBestScoreThreshold() {
		return 800d;
	}

	/**
	 * Collect descriptors that may be duplicates of the target.
	 *
	 * <p>
	 * The base filter combines the optional {@code additionalFilter}, the access
	 * constraints of the current user and the exclusion of {@code excludedById}.
	 * Two searches are then executed on top of it: a full-text search using the
	 * keywords of the target's title and column name, and an exact match on the
	 * target's column name. A descriptor matched by both searches appears twice in
	 * the result. A failed search is logged and does not prevent the other search
	 * from running.
	 * </p>
	 *
	 * @param target the descriptor for which duplicates are searched, must not be null
	 * @param excludedById ids of descriptors that must not be returned, may be null or empty
	 * @param additionalFilter optional filter narrowing the search, may be null
	 * @return the candidates, possibly empty, never null
	 */
	@Override
	protected List<Descriptor> getCandidates(Descriptor target, Collection<Long> excludedById, DescriptorFilter additionalFilter) {
		assert (target != null);
		LOG.info("Searching for duplicates of {}", target);

		var descriptorFilter = new DescriptorFilter();

		if (additionalFilter != null) {
			descriptorFilter.AND(additionalFilter);
		}

		applyAccessConstraints(descriptorFilter);

		if (CollectionUtils.isNotEmpty(excludedById)) {
			descriptorFilter.NOT(new DescriptorFilter());
			descriptorFilter.NOT().id(new HashSet<>(excludedById));
		}

		// Two queries of MAX_CANDIDATES_PER_QUERY results each
		List<Descriptor> candidates = new ArrayList<>(2 * MAX_CANDIDATES_PER_QUERY);

		// Non-blank title and column name, in this order
		List<String> searchTexts = new ArrayList<>(2);
		if (StringUtils.isNotBlank(target.getTitle())) {
			searchTexts.add(target.getTitle());
		}
		if (StringUtils.isNotBlank(target.getColumnName())) {
			searchTexts.add(target.getColumnName());
		}

		if (! searchTexts.isEmpty()) {
			try {
				String keywords = bigWords(toSafeEsQuery(String.join(" ", searchTexts)));
				if (StringUtils.isBlank(keywords)) {
					// Only short or non-letter words remained: there is nothing to search for
					LOG.info("No usable keywords for full-text search of {}", searchTexts);
				} else {
					DescriptorFilter filter = new DescriptorFilter();
					filter.AND(descriptorFilter);
					filter._text(keywords);
					collectMatches(filter, candidates);
				}
			} catch (SearchException e) {
				LOG.warn("Full-text search for duplicate candidates failed: {}", e.getMessage(), e);
			}
		}

		if (StringUtils.isNotBlank(target.getColumnName())) {
			try {
				DescriptorFilter filter = (DescriptorFilter) new DescriptorFilter().AND(descriptorFilter);
				filter.columnName(Set.of(target.getColumnName()));
				collectMatches(filter, candidates);
			} catch (SearchException e) {
				LOG.warn("Column name search for duplicate candidates failed: {}", e.getMessage(), e);
			}
		}

		return candidates;
	}

	/**
	 * Restrict the filter to descriptors the current user may see.
	 *
	 * <p>
	 * Administrators are not constrained. Users with WRITE permission on at least
	 * one {@link Partner} are limited to descriptors owned by those partners or
	 * descriptors that are published. Everyone else is limited to published
	 * descriptors.
	 * </p>
	 *
	 * @param descriptorFilter the filter to constrain, modified in place
	 */
	private void applyAccessConstraints(DescriptorFilter descriptorFilter) {
		if (securityUtils.hasRole(UserRole.ADMINISTRATOR)) {
			// No constraints
			return;
		}

		var mine = securityUtils.listObjectIdentityIdsForCurrentUser(Partner.class, BasePermission.WRITE);
		if (CollectionUtils.isEmpty(mine)) {
			descriptorFilter.state(PublishState.PUBLISHED);
			return;
		}

		DescriptorFilter root;
		if (descriptorFilter.AND == null) {
			root = new DescriptorFilter();
			// The constraint only takes effect when it is attached to the filter
			descriptorFilter.AND(root);
		} else {
			// NOTE(review): this looks like it replaces the AND filter set by the caller
			// (the additional filter) -- confirm the semantics of AND(filter)
			root = descriptorFilter.AND(new DescriptorFilter()).AND;
		}

		// owner is one of my partners OR descriptor is published
		root.owner().id(new HashSet<>(mine));
		root.OR(new DescriptorFilter())
			.OR().state(PublishState.PUBLISHED);
	}

	/**
	 * Run the query and append the first page of results to the candidates.
	 *
	 * @param filter the filter to execute
	 * @param candidates the list receiving the matching descriptors
	 * @throws SearchException when the search fails
	 */
	private void collectMatches(DescriptorFilter filter, List<Descriptor> candidates) throws SearchException {
		LOG.info("Filtering for {}", filter);
		var matches = descriptorService.listDescriptors(filter, PageRequest.of(0, MAX_CANDIDATES_PER_QUERY));
		candidates.addAll(matches.getContent());
	}

	/**
	 * Score the candidate of the hit against the target and store the result in
	 * {@code hit.score}.
	 *
	 * <p>
	 * The score starts at the current {@code hit.score}. Points are added when the
	 * candidate is published, is used in datasets or in descriptor lists, for the
	 * similarity of column names, titles, descriptions and crops, and for equal
	 * category, crop, column name and unit of measure. Vocabulary terms are scored
	 * by {@link #scoreTerms(Descriptor, Descriptor, Hit)}. Labels of the matching
	 * criteria are added to {@code hit.matches}.
	 * </p>
	 *
	 * @param target the descriptor for which duplicates are searched
	 * @param hit the hit holding the candidate to score
	 * @return the updated score of the hit
	 */
	@Override
	protected double scoreHit(Descriptor target, Hit<Descriptor> hit) {
		var candidate = hit.result;
		var score = hit.score;

		if (candidate.isPublished()) {
			score += 50;
			hit.matches.add("Published");
		}
		if (CollectionUtils.isNotEmpty(candidate.getDatasets())) {
			score += 50;
			hit.matches.add("Datasets");
		}
		if (CollectionUtils.isNotEmpty(candidate.getDescriptorLists())) {
			score += 50;
			hit.matches.add("Lists");
		}

		// Similarity of texts, including column name vs. title cross-checks
		score += similarityScore(hit.matches, target.getColumnName(), candidate.getColumnName()) * 100;
		score += similarityScore(hit.matches, target.getColumnName(), candidate.getTitle()) * 50;
		score += similarityScore(hit.matches, target.getTitle(), candidate.getColumnName()) * 50;
		score += similarityScore(hit.matches, target.getTitle(), candidate.getTitle()) * 300;
		score += similarityScore(hit.matches, target.getDescription(), candidate.getDescription()) * 200;
		score += similarityScore(hit.matches, target.getCrop(), candidate.getCrop()) * 200;

		// Bonus for equal values
		if (notNullEquals(hit.matches, target.getCategory(), candidate.getCategory())) {
			score += 10;
		}
		if (notNullEquals(hit.matches, target.getCrop(), candidate.getCrop())) {
			score += 100;
		}
		if (notNullEquals(hit.matches, target.getColumnName(), candidate.getColumnName())) {
			score += 100;
		}
		if (notNullEquals(hit.matches, target.getUom(), candidate.getUom())) {
			score += 50;
		}

		score += scoreTerms(target, candidate, hit);

		hit.score = score;
		return score;
	}

	/**
	 * Score the vocabulary terms of the candidate against the terms of the target.
	 *
	 * <ul>
	 * <li>0 when the terms of the target are not initialized</li>
	 * <li>200 when both have terms and the candidate has all term codes of the target</li>
	 * <li>the result of comparing term codes and term titles when both have terms
	 * but some target codes are missing from the candidate</li>
	 * <li>-50 when only one of them has terms</li>
	 * <li>100 when neither has terms</li>
	 * </ul>
	 *
	 * @param target the descriptor for which duplicates are searched
	 * @param candidate the potential duplicate
	 * @param hit the hit of the candidate, its {@code matches} are updated
	 * @return the value to add to the score of the hit
	 */
	private double scoreTerms(Descriptor target, Descriptor candidate, Hit<Descriptor> hit) {
		if (! Hibernate.isInitialized(target.getTerms())) {
			// Can't do much
			return 0;
		}

		boolean targetHasTerms = CollectionUtils.isNotEmpty(target.getTerms());
		boolean candidateHasTerms = CollectionUtils.isNotEmpty(candidate.getTerms());

		if (targetHasTerms && candidateHasTerms) {
			Set<String> targetCodes = target.getTerms().stream().map(VocabularyTerm::getCode).collect(Collectors.toSet());
			Set<String> candidateCodes = candidate.getTerms().stream().map(VocabularyTerm::getCode).collect(Collectors.toSet());

			if (candidateCodes.containsAll(targetCodes)) {
				hit.matches.add("Codes");
				return 200;
			}

			double termScore = 0;
			// Compare codes
			termScore += compareStrings(hit.matches, 20, targetCodes, candidateCodes);
			// Compare term titles
			termScore += compareStrings(hit.matches, 10,
				target.getTerms().stream().map(VocabularyTerm::getTitle).collect(Collectors.toSet()),
				candidate.getTerms().stream().map(VocabularyTerm::getTitle).collect(Collectors.toSet()));
			return termScore;
		}

		if (targetHasTerms || candidateHasTerms) {
			// One has terms!
			return -50;
		}

		// Neither have terms
		return 100;
	}

	/**
	 * Reduce the text to its unique keywords for the full-text query.
	 *
	 * <p>
	 * The text is split on whitespace, every character that is not a letter is
	 * removed from each word, and words shorter than 3 characters are dropped.
	 * The remaining unique words are ordered from the longest to the shortest.
	 * </p>
	 *
	 * @param text the text to process, may be null
	 * @return the keywords separated by a space, an empty string when no word
	 *         qualifies, or null when the text is null
	 */
	private String bigWords(String text) {
		if (text == null) {
			return null;
		}
		var kw = text.split("\\s+");
		var uKw = new ArrayList<>(Arrays.stream(kw).map(w -> w.replaceAll("[^\\p{L}]", "")).filter(w -> w.length() > 2).collect(Collectors.toSet()));
		// Longest words first
		uKw.sort((a, b) -> Integer.compare(b.length(), a.length()));
		LOG.info("Unique keywords for _text: {} = {}", text, uKw);
		return String.join(" ", uKw);
	}
}