CitationDuplicateFinder.java

/*
 * Copyright 2024 Global Crop Diversity Trust
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.genesys.server.service.worker.dupe;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.genesys.server.exception.SearchException;
import org.genesys.server.model.PublishState;
import org.genesys.server.model.bib.Citation;
import org.genesys.server.model.filters.CitationFilter;
import org.genesys.server.service.CitationService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Component;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Looks for likely duplicates of a {@link Citation} among published citations.
 *
 * <p>Candidates are fetched through {@link CitationService#list} using a full-text query made of the
 * significant words of the target's title. Each candidate is then re-scored in {@code scoreHit} by
 * comparing title, publication and publication year with the target.</p>
 */
@Component
public class CitationDuplicateFinder extends DuplicateFinder<Citation, CitationFilter> {

	/** Maximum number of candidates requested from the search in one lookup. */
	private static final int MAX_CANDIDATES = 50;

	/** Bonus added when both titles are non-null and equal. */
	private static final int TITLE_EXACT_SCORE = 1000;
	/** Weight applied to each of the two fuzzy title comparisons when titles are not equal. */
	private static final int TITLE_FUZZY_WEIGHT = 500;
	/** Bonus added when both publications are non-null and equal. */
	private static final int PUBLICATION_EXACT_SCORE = 200;
	/** Weight applied to each of the two fuzzy publication comparisons when publications are not equal. */
	private static final int PUBLICATION_FUZZY_WEIGHT = 50;
	/** Bonus added when both publication years are non-null and equal. */
	private static final int PUBLICATION_YEAR_SCORE = 100;

	@Autowired
	private CitationService citationService;

	/**
	 * Returns the score threshold used by the base class.
	 *
	 * <p>The value equals the bonus {@code scoreHit} awards for an exact title match.
	 * NOTE(review): how {@code DuplicateFinder} applies the threshold is not visible from this class.</p>
	 *
	 * @return {@code 1000}
	 */
	@Override
	protected double getBestScoreThreshold() {
		return 1000d;
	}

	/**
	 * Collects published citations that may duplicate {@code target}.
	 *
	 * <p>Only the title is used to build the query. No search is issued when the title is blank or
	 * yields no usable keywords, because an empty text query would return unrelated citations.
	 * Lookups by publication and by author name are currently not performed.</p>
	 *
	 * @param target the citation to find duplicates of
	 * @param excludedById IDs that must not be returned; may be {@code null} or empty
	 * @param additionalFilter extra filter AND-ed to the query; may be {@code null}
	 * @return a mutable list of candidates, empty when nothing was searched or the search failed
	 */
	@Override
	protected List<Citation> getCandidates(Citation target, Collection<Long> excludedById, CitationFilter additionalFilter) {

		List<Citation> candidates = new ArrayList<>(100);

		String title = target.getTitle();
		if (StringUtils.isNotBlank(title)) {
			try {
				// toSafeEsQuery is inherited; presumably it sanitizes the text for the search engine -- confirm in DuplicateFinder
				String keywords = bigWords(toSafeEsQuery(title));
				if (StringUtils.isNotBlank(keywords)) {
					CitationFilter filter = getCandidatesFilter(target, excludedById, candidates, additionalFilter);
					filter._text(keywords);

					LOG.info("Filtering for {}", filter);
					var matches = citationService.list(filter, PageRequest.of(0, MAX_CANDIDATES));
					candidates.addAll(matches.getContent());
				} else {
					LOG.debug("No usable title keywords for citation {}, skipping candidate search", target.getId());
				}
			} catch (SearchException e) {
				// Best effort: a failed search yields no candidates, but keep the stack trace in the log
				LOG.warn("Candidate search failed for citation {}: {}", target.getId(), e.getMessage(), e);
			}
		}

		return candidates;
	}

	/**
	 * Builds the base filter for a candidate lookup: published citations only, excluding the target
	 * itself, the caller's excluded IDs and any candidates already found.
	 *
	 * @param target the citation being checked; its ID is excluded when not {@code null}
	 * @param excludedById IDs to exclude; may be {@code null} or empty
	 * @param candidates candidates found so far, excluded so they are not returned again
	 * @param additionalFilter extra filter AND-ed to the result; may be {@code null}
	 * @return a new filter instance
	 */
	private CitationFilter getCandidatesFilter(Citation target, Collection<Long> excludedById, List<Citation> candidates, CitationFilter additionalFilter) {
		CitationFilter filter = new CitationFilter();
		filter.state(Set.of(PublishState.PUBLISHED));
		// Only create the NOT clause when there is at least one ID to exclude
		if (target.getId() != null || !CollectionUtils.isEmpty(excludedById) || !candidates.isEmpty()) {
			filter.NOT(new CitationFilter());
			filter.NOT.id(new HashSet<>());
			if (target.getId() != null) {
				filter.NOT.id().add(target.getId());
			}
			if (!CollectionUtils.isEmpty(excludedById)) {
				filter.NOT.id().addAll(excludedById);
			}
			filter.NOT.id().addAll(candidates.stream().map(Citation::getId).collect(Collectors.toSet())); // Not already found
		}
		if (additionalFilter != null) {
			filter.AND(additionalFilter);
		}
		return filter;
	}

	/**
	 * Adds field-comparison bonuses to the score of a search hit and stores the result on the hit.
	 *
	 * <p>Starting from {@code hit.score}, the method adds:</p>
	 * <ul>
	 * <li>title: {@value #TITLE_EXACT_SCORE} when equal, otherwise the two inherited fuzzy comparisons
	 * ({@code similarityScore} and {@code stringsAndNumbersCompare}), each weighted by {@value #TITLE_FUZZY_WEIGHT};</li>
	 * <li>publication: {@value #PUBLICATION_EXACT_SCORE} when equal, otherwise the same two comparisons,
	 * each weighted by {@value #PUBLICATION_FUZZY_WEIGHT};</li>
	 * <li>publication year: {@value #PUBLICATION_YEAR_SCORE} when equal.</li>
	 * </ul>
	 * <p>Author name and abstract text are currently not part of the score.</p>
	 *
	 * @param a the citation being checked for duplicates
	 * @param hit the candidate; its {@code score} and {@code matches} fields are overwritten
	 * @return the updated score, also stored in {@code hit.score}
	 */
	@Override
	protected double scoreHit(Citation a, Hit<Citation> hit) {
		double score = hit.score;
		var b = hit.result;

		// The comparison helpers receive this list; its distinct entries are stored on the hit below
		var matches = new ArrayList<String>();

		if (notNullEquals(matches, a.getTitle(), b.getTitle())) {
			score += TITLE_EXACT_SCORE;
		} else {
			score += similarityScore(matches, a.getTitle(), b.getTitle()) * TITLE_FUZZY_WEIGHT;
			score += stringsAndNumbersCompare(matches, a.getTitle(), b.getTitle()) * TITLE_FUZZY_WEIGHT;
		}

		if (notNullEquals(matches, a.getPublication(), b.getPublication())) {
			score += PUBLICATION_EXACT_SCORE;
		} else {
			score += similarityScore(matches, a.getPublication(), b.getPublication()) * PUBLICATION_FUZZY_WEIGHT;
			score += stringsAndNumbersCompare(matches, a.getPublication(), b.getPublication()) * PUBLICATION_FUZZY_WEIGHT;
		}

		if (notNullEquals(matches, a.getPublicationYear(), b.getPublicationYear())) {
			score += PUBLICATION_YEAR_SCORE;
		}

		hit.matches = matches.stream().distinct().collect(Collectors.toList());
		hit.score = score;

		return score;
	}

	/**
	 * Reduces text to its distinct "big" words for use as a full-text query.
	 *
	 * <p>The text is split on whitespace, every non-letter character is removed from each token, and
	 * tokens shorter than three letters are dropped. Duplicates are removed while keeping the order of
	 * first occurrence, so the resulting query is stable and readable in logs.</p>
	 *
	 * @param text the source text; may be {@code null}
	 * @return the space-separated keywords, an empty string when none qualify, or {@code null} for {@code null} input
	 */
	private String bigWords(String text) {
		if (text == null) {
			return null;
		}
		List<String> keywords = Arrays.stream(text.split("\\s+"))
			.map(word -> word.replaceAll("[^\\p{L}]", ""))
			.filter(word -> word.length() > 2)
			.distinct()
			.collect(Collectors.toList());
		LOG.trace("Unique keywords for _text: {} = {}", text, keywords);
		return String.join(" ", keywords);
	}
}