DescriptorDuplicateFinder.java
/*
* Copyright 2021 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys.server.service.worker.dupe;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.server.component.security.SecurityUtils;
import org.genesys.server.model.Partner;
import org.genesys.server.model.PublishState;
import org.genesys.server.model.UserRole;
import org.genesys.server.model.filters.DescriptorFilter;
import org.genesys.server.model.traits.Descriptor;
import org.genesys.server.model.vocab.VocabularyTerm;
import org.genesys.server.service.DescriptorService;
import org.genesys.server.exception.SearchException;
import org.hibernate.Hibernate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.security.acls.domain.BasePermission;
import org.springframework.stereotype.Component;
/**
 * Finds potentially duplicate {@link Descriptor}s.
 *
 * <p>
 * Candidates are fetched through {@code DescriptorService.listDescriptors} with two searches: a
 * full-text query built from the target's title and column name, and an exact match on the
 * target's column name. Every candidate is then scored by {@link #scoreHit(Descriptor, Hit)}.
 * </p>
 *
 * @author Matija Obreza
 */
@Component
public class DescriptorDuplicateFinder extends DuplicateFinder<Descriptor, DescriptorFilter> {

    /** Maximum number of candidates requested from each individual search. */
    private static final int MAX_CANDIDATES_PER_SEARCH = 50;

    @Autowired
    private DescriptorService descriptorService;

    @Autowired
    private SecurityUtils securityUtils;

    /**
     * Gets the best score threshold used by the duplicate finder.
     *
     * @return always {@code 800}
     */
    @Override
    protected double getBestScoreThreshold() {
        return 800d;
    }

    /**
     * Collects descriptors that may be duplicates of {@code target}.
     *
     * <p>
     * Two searches are executed, each limited to {@value #MAX_CANDIDATES_PER_SEARCH} results: a
     * full-text search on the significant words of the target's title and column name, and an
     * exact search on the target's column name. The results are concatenated, so a descriptor
     * found by both searches appears twice in the returned list.
     * </p>
     *
     * @param target the descriptor to find duplicates of, must not be {@code null}
     * @param excludedById ids of descriptors that must not be returned, may be {@code null} or empty
     * @param additionalFilter extra constraints to apply, may be {@code null}
     * @return the candidates found; empty when nothing matched or when the searches failed
     */
    @Override
    protected List<Descriptor> getCandidates(Descriptor target, Collection<Long> excludedById, DescriptorFilter additionalFilter) {
        assert (target != null);
        LOG.info("Searching for duplicates of {}", target);

        var descriptorFilter = new DescriptorFilter();
        if (additionalFilter != null) {
            descriptorFilter.AND(additionalFilter);
        }

        restrictToVisibleDescriptors(descriptorFilter);

        if (! CollectionUtils.isEmpty(excludedById)) {
            descriptorFilter.NOT(new DescriptorFilter());
            descriptorFilter.NOT().id(new HashSet<>(excludedById));
        }

        List<Descriptor> candidates = new ArrayList<>(2 * MAX_CANDIDATES_PER_SEARCH);
        var page = PageRequest.of(0, MAX_CANDIDATES_PER_SEARCH);

        // Search 1: full-text match on the words of the title and column name
        if (StringUtils.isNotBlank(target.getTitle()) || StringUtils.isNotBlank(target.getColumnName())) {
            try {
                DescriptorFilter filter = new DescriptorFilter();
                filter.AND(descriptorFilter);
                var titleAndColumnName = List.of(
                        StringUtils.defaultIfBlank(target.getTitle(), ""),
                        StringUtils.defaultIfBlank(target.getColumnName(), "")
                    ).stream()
                    .filter(s -> StringUtils.trimToNull(s) != null)
                    .collect(Collectors.joining(" "));
                filter._text(bigWords(toSafeEsQuery(titleAndColumnName)));
                LOG.info("Filtering for {}", filter.toString());
                var matches = descriptorService.listDescriptors(filter, page);
                candidates.addAll(matches.getContent());
            } catch (SearchException e) {
                // Best effort: a failed search must not abort duplicate detection,
                // but keep the cause in the log so it can be diagnosed.
                LOG.warn("Full-text search for duplicate descriptors failed: {}", e.getMessage(), e);
            }
        }

        // Search 2: exact match on the column name
        if (StringUtils.isNotBlank(target.getColumnName())) {
            try {
                DescriptorFilter filter = (DescriptorFilter) new DescriptorFilter().AND(descriptorFilter);
                filter.columnName(Set.of(target.getColumnName()));
                LOG.info("Filtering for {}", filter.toString());
                var matches = descriptorService.listDescriptors(filter, page);
                candidates.addAll(matches.getContent());
            } catch (SearchException e) {
                LOG.warn("Column name search for duplicate descriptors failed: {}", e.getMessage(), e);
            }
        }

        return candidates;
    }

    /**
     * Adds visibility constraints for the current user to {@code descriptorFilter}.
     *
     * <p>
     * Administrators get no constraints. For other users that have {@code WRITE} permission on at
     * least one {@link Partner}, a nested filter is attached that selects descriptors owned by
     * those partners, with an {@code OR} filter for {@link PublishState#PUBLISHED}. Users without
     * such permissions are limited to {@link PublishState#PUBLISHED} descriptors.
     * </p>
     *
     * @param descriptorFilter the filter to constrain, modified in place
     */
    private void restrictToVisibleDescriptors(DescriptorFilter descriptorFilter) {
        if (securityUtils.hasRole(UserRole.ADMINISTRATOR)) {
            // No constraints
            return;
        }

        var root = descriptorFilter.AND;
        if (root == null) {
            // The nested filter must be attached to descriptorFilter, otherwise the
            // owner/published constraint set on it below is silently discarded.
            var attached = new DescriptorFilter();
            descriptorFilter.AND(attached);
            root = attached;
        } else {
            // NOTE(review): this stores a new filter in AND. If AND(...) is a plain setter,
            // the additionalFilter stored earlier is dropped for non-administrators --
            // confirm against the DescriptorFilter implementation.
            root = descriptorFilter.AND(new DescriptorFilter()).AND;
        }

        var mine = securityUtils.listObjectIdentityIdsForCurrentUser(Partner.class, BasePermission.WRITE);
        if (mine.size() > 0) {
            // Owned by one of "my" partners, OR published
            root.owner().id(new HashSet<>(mine));
            root.OR(new DescriptorFilter())
                .OR().state(PublishState.PUBLISHED);
        } else {
            descriptorFilter.state(PublishState.PUBLISHED);
        }
    }

    /**
     * Scores a candidate against the target, starting from the score already stored in the hit.
     *
     * <p>
     * Points are added when the candidate is published, is used in datasets or in descriptor
     * lists, for text similarity of column name, title, description and crop, and for equal
     * category, crop, column name and unit of measure. Vocabulary terms are compared only when
     * the target's terms are initialized. Labels of what matched are appended to
     * {@code hit.matches}, and the final score is written back to {@code hit.score}.
     * </p>
     *
     * @param target the descriptor duplicates are searched for
     * @param hit the candidate hit; its {@code matches} and {@code score} are updated
     * @return the updated score of the hit
     */
    @Override
    protected double scoreHit(Descriptor target, Hit<Descriptor> hit) {
        var candidate = hit.result;
        var score = hit.score;

        // Candidates that are published or already in use score higher
        if (candidate.isPublished()) {
            score += 50;
            hit.matches.add("Published");
        }
        if (candidate.getDatasets().size() > 0) {
            score += 50;
            hit.matches.add("Datasets");
        }
        if (candidate.getDescriptorLists().size() > 0) {
            score += 50;
            hit.matches.add("Lists");
        }

        // Weighted text similarity; title and column name are also cross-compared
        score += similarityScore(hit.matches, target.getColumnName(), candidate.getColumnName()) * 100;
        score += similarityScore(hit.matches, target.getColumnName(), candidate.getTitle()) * 50;
        score += similarityScore(hit.matches, target.getTitle(), candidate.getColumnName()) * 50;
        score += similarityScore(hit.matches, target.getTitle(), candidate.getTitle()) * 300;
        score += similarityScore(hit.matches, target.getDescription(), candidate.getDescription()) * 200;
        score += similarityScore(hit.matches, target.getCrop(), candidate.getCrop()) * 200;

        // Bonuses for equal values
        if (notNullEquals(hit.matches, target.getCategory(), candidate.getCategory())) {
            score += 10;
        }
        if (notNullEquals(hit.matches, target.getCrop(), candidate.getCrop())) {
            score += 100;
        }
        if (notNullEquals(hit.matches, target.getColumnName(), candidate.getColumnName())) {
            score += 100;
        }
        if (notNullEquals(hit.matches, target.getUom(), candidate.getUom())) {
            score += 50;
        }

        // Match terms
        if (! Hibernate.isInitialized(target.getTerms())) {
            // Can't do much
        } else if (CollectionUtils.isNotEmpty(target.getTerms()) && CollectionUtils.isNotEmpty(candidate.getTerms())) {
            // Compare codes
            Set<String> targetCodes = target.getTerms().stream().map(VocabularyTerm::getCode).collect(Collectors.toSet());
            Set<String> candidateCodes = candidate.getTerms().stream().map(VocabularyTerm::getCode).collect(Collectors.toSet());

            if (candidateCodes.size() > 0 && targetCodes.size() > 0 && candidateCodes.containsAll(targetCodes)) {
                // The candidate has every code of the target
                score += 200;
                hit.matches.add("Codes");
            } else {
                // Compare codes
                score += compareStrings(hit.matches, 20, targetCodes, candidateCodes);
                // Compare term titles
                score += compareStrings(hit.matches, 10,
                    target.getTerms().stream().map(VocabularyTerm::getTitle).collect(Collectors.toSet()),
                    candidate.getTerms().stream().map(VocabularyTerm::getTitle).collect(Collectors.toSet()));
            }
        } else if (CollectionUtils.isNotEmpty(target.getTerms()) || CollectionUtils.isNotEmpty(candidate.getTerms())) {
            // One has terms!
            score -= 50;
        } else {
            // Neither have terms
            score += 100;
        }

        hit.score = score;
        return score;
    }

    /**
     * Reduces a text to its unique words of more than two letters, longest first.
     *
     * <p>
     * The text is split on whitespace, every character that is not a Unicode letter is removed
     * from each word, and words of two letters or fewer are dropped. The order of words of equal
     * length is unspecified because duplicates are removed through a {@link Set}.
     * </p>
     *
     * @param text the text to process, may be {@code null}
     * @return the remaining words joined by single spaces (possibly an empty string), or
     *         {@code null} if {@code text} is {@code null}
     */
    private String bigWords(String text) {
        if (text == null) {
            return null;
        }
        var kw = text.split("\\s+");
        var uKw = new ArrayList<>(Arrays.stream(kw)
            .map(w -> w.replaceAll("[^\\p{L}]", ""))
            .filter(w -> w.length() > 2)
            .collect(Collectors.toSet()));
        // Longest words first
        uKw.sort((a, b) -> -Integer.compare(a.length(), b.length()));
        LOG.info("Unique keywords for _text: {} = {}", text, uKw);
        return String.join(" ", uKw);
    }
}