// CitationDuplicateFinder.java
/*
* Copyright 2024 Global Crop Diversity Trust
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.genesys.server.service.worker.dupe;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.server.exception.SearchException;
import org.genesys.server.model.PublishState;
import org.genesys.server.model.bib.Citation;
import org.genesys.server.model.filters.CitationFilter;
import org.genesys.server.service.CitationService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Finds likely duplicates of a {@link Citation} among published citations.
 *
 * <p>Candidates are fetched through {@link CitationService} with a text query built from the
 * keywords of the target's title. Each candidate is then scored by comparing title, publication
 * and publication year. Author name and abstract text are used neither for the lookup nor for
 * the score.
 */
@Component
public class CitationDuplicateFinder extends DuplicateFinder<Citation, CitationFilter> {

  /** Maximum number of candidates requested from the citation service in one lookup. */
  private static final int MAX_CANDIDATES = 50;

  /** Matches every character that is not a Unicode letter (digits and punctuation included). */
  private static final Pattern NON_LETTERS = Pattern.compile("[^\\p{L}]");

  @Autowired
  private CitationService citationService;

  /**
   * Returns the best-score threshold for citations.
   *
   * <p>The value equals the bonus {@link #scoreHit} adds when {@code notNullEquals} reports the
   * two titles as equal. How the threshold is applied is decided by the base class.
   *
   * @return {@code 1000}
   */
  @Override
  protected double getBestScoreThreshold() {
    return 1000d;
  }

  /**
   * Looks up published citations that may duplicate {@code target}.
   *
   * <p>Only the title feeds the text query. When the title is blank, or yields no keyword of
   * three or more letters, no lookup is made and an empty list is returned; running the lookup
   * with an empty text query would not be related to the target at all.
   *
   * @param target the citation to find duplicates of
   * @param excludedById IDs that must not be returned; may be {@code null} or empty
   * @param additionalFilter extra constraints AND-ed to the lookup; may be {@code null}
   * @return a mutable list of candidates, never {@code null}; empty when nothing was searched or
   *     the search failed
   */
  @Override
  protected List<Citation> getCandidates(Citation target, Collection<Long> excludedById, CitationFilter additionalFilter) {
    List<Citation> candidates = new ArrayList<>(MAX_CANDIDATES);
    if (StringUtils.isBlank(target.getTitle())) {
      return candidates;
    }
    try {
      String keywords = bigWords(toSafeEsQuery(target.getTitle()));
      if (StringUtils.isBlank(keywords)) {
        LOG.debug("No usable title keywords for citation {}, skipping candidate search", target.getId());
        return candidates;
      }
      CitationFilter filter = getCandidatesFilter(target, excludedById, candidates, additionalFilter);
      filter._text(keywords);
      LOG.info("Filtering for {}", filter);
      var matches = citationService.list(filter, PageRequest.of(0, MAX_CANDIDATES));
      candidates.addAll(matches.getContent());
    } catch (SearchException e) {
      // Best effort: a failed search yields no candidates, but keep the stack trace in the log.
      LOG.warn("Candidate search failed for citation {}: {}", target.getId(), e.getMessage(), e);
    }
    return candidates;
  }

  /**
   * Builds the base filter for a candidate lookup: published citations only, excluding the
   * target itself, the caller's exclusions and the candidates collected so far.
   *
   * @param target the citation being checked; its ID is excluded when it has one
   * @param excludedById IDs to exclude; may be {@code null} or empty
   * @param candidates candidates already found, excluded so they are not returned again
   * @param additionalFilter extra constraints AND-ed to the filter; may be {@code null}
   * @return a new filter; the caller adds the actual search criteria
   */
  private CitationFilter getCandidatesFilter(Citation target, Collection<Long> excludedById, List<Citation> candidates, CitationFilter additionalFilter) {
    CitationFilter filter = new CitationFilter();
    filter.state(Set.of(PublishState.PUBLISHED));

    boolean hasTargetId = target.getId() != null;
    boolean hasExclusions = !CollectionUtils.isEmpty(excludedById);
    // The NOT clause is created only when there is at least one ID to exclude.
    if (hasTargetId || hasExclusions || !candidates.isEmpty()) {
      filter.NOT(new CitationFilter());
      filter.NOT.id(new HashSet<>());
      if (hasTargetId) {
        filter.NOT.id().add(target.getId());
      }
      if (hasExclusions) {
        filter.NOT.id().addAll(excludedById);
      }
      // Not already found
      filter.NOT.id().addAll(candidates.stream().map(Citation::getId).collect(Collectors.toSet()));
    }

    if (additionalFilter != null) {
      filter.AND(additionalFilter);
    }
    return filter;
  }

  /**
   * Scores a candidate against the target citation, starting from the score already on the hit.
   *
   * <ul>
   *   <li>Title: +1000 when {@code notNullEquals} holds; otherwise {@code similarityScore} and
   *       {@code stringsAndNumbersCompare} are each weighted by 500.</li>
   *   <li>Publication: +200 when {@code notNullEquals} holds; otherwise the same two helpers are
   *       each weighted by 50.</li>
   *   <li>Publication year: +100 when {@code notNullEquals} holds.</li>
   * </ul>
   *
   * <p>The helpers come from outside this class; they receive the {@code matches} list and
   * presumably record what matched in it (to be confirmed against the base class). The hit is
   * updated in place: {@code hit.matches} receives the de-duplicated entries and
   * {@code hit.score} the final score.
   *
   * @param a the target citation
   * @param hit the candidate hit; its {@code score} and {@code matches} are overwritten
   * @return the final score, same value as {@code hit.score}
   */
  @Override
  protected double scoreHit(Citation a, Hit<Citation> hit) {
    double score = hit.score;
    var b = hit.result;
    var matches = new ArrayList<String>();

    if (notNullEquals(matches, a.getTitle(), b.getTitle())) {
      score += 1000;
    } else {
      score += similarityScore(matches, a.getTitle(), b.getTitle()) * 500;
      score += stringsAndNumbersCompare(matches, a.getTitle(), b.getTitle()) * 500;
    }

    if (notNullEquals(matches, a.getPublication(), b.getPublication())) {
      score += 200;
    } else {
      score += similarityScore(matches, a.getPublication(), b.getPublication()) * 50;
      score += stringsAndNumbersCompare(matches, a.getPublication(), b.getPublication()) * 50;
    }

    if (notNullEquals(matches, a.getPublicationYear(), b.getPublicationYear())) {
      score += 100;
    }

    hit.matches = matches.stream().distinct().collect(Collectors.toList());
    hit.score = score;
    return score;
  }

  /**
   * Reduces free text to its distinct "big" words for use in a text query.
   *
   * <p>The text is split on whitespace, every non-letter character (digits included) is removed
   * from each token, and tokens left with fewer than three letters are dropped. Duplicates are
   * removed and the first-occurrence order of the text is kept.
   *
   * @param text the text to reduce; may be {@code null}
   * @return the keywords joined by single spaces, an empty string when none qualify, or
   *     {@code null} when {@code text} is {@code null}
   */
  private String bigWords(String text) {
    if (text == null) {
      return null;
    }
    List<String> keywords = Arrays.stream(text.split("\\s+"))
        .map(word -> NON_LETTERS.matcher(word).replaceAll(""))
        .filter(word -> word.length() > 2)
        .distinct()
        .collect(Collectors.toList());
    LOG.trace("Unique keywords for _text: {} = {}", text, keywords);
    return String.join(" ", keywords);
  }
}