UsdaTaxonomyUpdater.java

/*
 * Copyright 2020 Global Crop Diversity Trust
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.genesys.server.service.worker;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileTime;
import java.time.Instant;
import java.time.ZoneOffset;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Future;
import java.util.stream.Collectors;

import org.apache.commons.io.FileUtils;
import org.genesys.server.model.genesys.QTaxonomy2;
import org.genesys.server.model.grin.QTaxonomyCommonName;
import org.genesys.server.model.grin.QTaxonomyFamily;
import org.genesys.server.model.grin.QTaxonomyGenus;
import org.genesys.server.model.grin.QTaxonomySpecies;
import org.genesys.server.model.grin.TaxonomyCommonName;
import org.genesys.server.model.grin.TaxonomyFamily;
import org.genesys.server.model.grin.TaxonomyGenus;
import org.genesys.server.model.grin.TaxonomySpecies;
import org.genesys.server.persistence.Taxonomy2Repository;
import org.genesys.server.persistence.grin.TaxonomyCommonNameRepository;
import org.genesys.server.persistence.grin.TaxonomyFamilyRepository;
import org.genesys.server.persistence.grin.TaxonomyGenusRepository;
import org.genesys.server.persistence.grin.TaxonomySpeciesRepository;
import org.genesys.spring.TransactionHelper;
import org.genesys.taxonomy.download.TaxonomyDownloader;
import org.genesys.taxonomy.gringlobal.component.CabReader;
import org.genesys.taxonomy.gringlobal.model.CommonNameRow;
import org.genesys.taxonomy.gringlobal.model.FamilyRow;
import org.genesys.taxonomy.gringlobal.model.GenusRow;
import org.genesys.taxonomy.gringlobal.model.SpeciesRow;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import org.springframework.security.access.prepost.PreAuthorize;
import org.springframework.stereotype.Component;

import com.google.common.collect.Lists;
import com.opencsv.CSVReader;
import com.querydsl.jpa.impl.JPAQueryFactory;

import lombok.extern.slf4j.Slf4j;

import javax.persistence.EntityManager;
import javax.persistence.PersistenceContext;

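/**
 * The component downloads the current GRIN Taxonomy database if no local copy
 * exists or the local copy is more than five days old, and updates the Family,
 * Genus, Species and Common Name tables in the local database.
 * The matching is done on entity IDs of USDA GRIN Taxonomy and it overrides
 * local data. Local records whose IDs are no longer present in the downloaded
 * data are removed.
 *
 * @author Matija Obreza
 */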
@Component
@Slf4j
public class UsdaTaxonomyUpdater implements InitializingBean {

	// Repositories for the local copies of the GRIN taxonomy tables; used by
	// updateLocalDatabase() to save refreshed records and delete untouched ones.
	@Autowired
	private TaxonomyGenusRepository taxonomyGenusRepository;
	@Autowired
	private TaxonomySpeciesRepository taxonomySpeciesRepository;
	@Autowired
	private TaxonomyCommonNameRepository taxonomyCommonNameRepository;
	@Autowired
	private TaxonomyFamilyRepository taxonomyFamilyRepository;
	// NOTE(review): not referenced anywhere in this class as far as visible here.
	@Autowired
	private Taxonomy2Repository taxonomy2Repository;

	// Executor to which the row-to-entity conversion batches are submitted.
	@Autowired
	private ThreadPoolTaskExecutor taskExecutor;

	// Used to fetch the IDs of existing records and to clear Taxonomy2
	// references to species that are about to be deleted.
	@Autowired
	private JPAQueryFactory jpaQueryFactory;

	// NOTE(review): not referenced anywhere in this class as far as visible here.
	@PersistenceContext
	private EntityManager entityManager;

	// Value of the "data.dir" property; parent of the GRIN taxonomy folder.
	@Value("${data.dir}")
	private String rootDataDir;

	// The "grin-taxonomy" folder under rootDataDir; assigned in afterPropertiesSet().
	private File downloadFolder;

	/**
	 * Resolves the GRIN Taxonomy data folder ({@code grin-taxonomy} under the
	 * configured {@code data.dir}) and logs its absolute path.
	 */
	@Override
	public void afterPropertiesSet() throws Exception {
		this.downloadFolder = new File(rootDataDir, "grin-taxonomy");
		final String absoluteLocation = this.downloadFolder.toPath().toAbsolutePath().toString();
		log.warn("GRIN Taxonomy data folder: {}", absoluteLocation);
	}

	/**
	 * Refreshes the local taxonomy tables from GRIN Taxonomy.
	 * <p>
	 * The local database is only touched when {@link #downloadDataIfNeeded(File)}
	 * reports that fresh data was downloaded; otherwise the method just logs that
	 * the data is still recent. Callers need the {@code ADMINISTRATOR} role.
	 * </p>
	 *
	 * @throws Exception when downloading the data or updating the local tables fails
	 */
	@PreAuthorize("hasRole('ADMINISTRATOR')")
	// @Scheduled(initialDelayString = "P1D", fixedDelayString = "P7DT1H")
	// @SchedulerLock(name = "org.genesys.server.service.worker.UsdaTaxonomyUpdater")
	public void update() throws Exception {
		log.warn("Updating GRIN taxonomy database from folder {}", downloadFolder.getAbsolutePath());

		final boolean freshDataDownloaded = downloadDataIfNeeded(downloadFolder);
		if (!freshDataDownloaded) {
			log.warn("Taxonomy database is still recent. Enjoy!");
			return;
		}

		updateLocalDatabase();
		log.warn("Taxonomy database updated successfully. Enjoy!");
	}

	/**
	 * Refreshes the local GRIN taxonomy tables from the text files in the download
	 * folder. The update starts with {@link TaxonomyFamily}, then
	 * {@link TaxonomyGenus}, {@link TaxonomySpecies} and finally
	 * {@link TaxonomyCommonName}. Source rows carry their GRIN identifiers and
	 * overwrite the local records with the same IDs.
	 * <p>
	 * Local records whose IDs do not appear in the source are deleted, but only
	 * when every batch of that table was saved successfully. If a batch failed,
	 * its IDs are indistinguishable from obsolete ones, so deletion is skipped for
	 * that table.
	 * </p>
	 * <p>
	 * Note: The update may update capitalization of names.
	 * </p>
	 *
	 * @throws Exception if a source file cannot be opened or read, or if the thread
	 *                   is interrupted while waiting for a batch
	 */
	private void updateLocalDatabase() throws Exception {
		refreshFamilies();
		refreshGenera();
		refreshSpecies();
		refreshCommonNames();
	}

	/**
	 * Loads taxonomy_family.txt, saves all families and removes local families
	 * that are absent from the source.
	 */
	private void refreshFamilies() throws Exception {
		// read taxonomy_family.txt
		log.info("Loading {}/taxonomy_family.txt", downloadFolder);

		List<FamilyRow> ggFamilies = new ArrayList<>();
		// The stream is a resource of its own so it is closed even if opening the CSV reader fails
		try (FileInputStream input = new FileInputStream(new File(downloadFolder, "taxonomy_family.txt"));
			CSVReader reader = CabReader.openCsvReader(input, 0)) {
			Iterator<FamilyRow> beanReader = CabReader.beanReader(FamilyRow.class, reader).iterator();

			FamilyRow familyRow = null;
			while (beanReader.hasNext() && (familyRow = beanReader.next()) != null) {
				ggFamilies.add(familyRow);
			}
		}

		// Start with all local IDs; every saved ID is removed, what remains was not in the source
		final var missingFamiliesId = new HashSet<>(jpaQueryFactory.from(QTaxonomyFamily.taxonomyFamily).select(QTaxonomyFamily.taxonomyFamily.id).fetch());

		List<Future<List<TaxonomyFamily>>> futures = Lists.partition(ggFamilies, 1000).stream().map(batch -> taskExecutor.submit(() -> {
			return updateFamily(batch);
		})).collect(Collectors.toList());

		// family ID -> ID of its current family, restored after all families are saved
		Map<Long, Long> familyCurrentMap = new HashMap<>();
		boolean allBatchesSaved = true;

		// Wait for all tasks to complete
		for (Future<List<TaxonomyFamily>> future : futures) {
			try {
				List<TaxonomyFamily> result = future.get();

				// Detach references to other families before saving; they are re-attached below
				// (presumably because the referenced family may not be saved yet)
				for (TaxonomyFamily family : result) {
					TaxonomyFamily current = family.getCurrentTaxonomyFamily();
					if (current != null && current.getId() != null) {
						familyCurrentMap.put(family.getId(), current.getId());
						family.setCurrentTaxonomyFamily(null);
					}
				}

				TransactionHelper.executeInTransaction(false, () -> taxonomyFamilyRepository.saveAll(result));

				result.stream().map(TaxonomyFamily::getId).forEach(missingFamiliesId::remove);
				log.info("Updated {} families", result.size());
			} catch (InterruptedException e) {
				// Restore the interrupt flag and abort instead of swallowing the interruption
				Thread.currentThread().interrupt();
				throw e;
			} catch (Exception e) {
				allBatchesSaved = false;
				log.error(e.getMessage(), e);
			}
		}

		if (!missingFamiliesId.isEmpty()) {
			log.warn("After refreshing grin_family {} records remained untouched: grin_family id={}", missingFamiliesId.size(), missingFamiliesId);
			if (allBatchesSaved) {
				try {
					taxonomyFamilyRepository.deleteAllByIdInBatch(missingFamiliesId);
				} catch (Throwable e) {
					log.warn("Could not delete untouched TaxonomyFamilies: {}", e.getMessage());
				}
			} else {
				// IDs of failed batches are still in the set and must not be deleted
				log.warn("Not deleting untouched TaxonomyFamilies because at least one batch failed to update");
			}
		}

		// Re-attach the current-family references in pages of 200
		var familyList = new ArrayList<>(familyCurrentMap.keySet());
		for (int i = 0; i < familyList.size(); i += 200) {
			int endIndex = Math.min(i + 200, familyList.size());
			var families = taxonomyFamilyRepository.findAllById(familyList.subList(i, endIndex));
			families.forEach(family -> family.setCurrentTaxonomyFamily(TaxonomyFamily.withId(familyCurrentMap.get(family.getId()))));
			taxonomyFamilyRepository.saveAll(families);
		}
	}

	/**
	 * Loads taxonomy_genus.txt, saves all genera and removes local genera that are
	 * absent from the source.
	 */
	private void refreshGenera() throws Exception {
		// read taxonomy_genus.txt
		log.info("Loading taxonomy_genus.txt");

		List<GenusRow> ggGens = new ArrayList<>();
		try (FileInputStream input = new FileInputStream(new File(downloadFolder, "taxonomy_genus.txt"));
			CSVReader reader = CabReader.openCsvReader(input, 0)) {
			Iterator<GenusRow> beanReader = CabReader.beanReader(GenusRow.class, reader).iterator();

			GenusRow genusRow = null;
			while (beanReader.hasNext() && (genusRow = beanReader.next()) != null) {
				ggGens.add(genusRow);
			}
		}

		final var missingGenusId = new HashSet<>(jpaQueryFactory.from(QTaxonomyGenus.taxonomyGenus).select(QTaxonomyGenus.taxonomyGenus.id).fetch());

		List<Future<List<TaxonomyGenus>>> futures = Lists.partition(ggGens, 1000).stream().map(batch -> taskExecutor.submit(() -> {
			return updateGenera(batch);
		})).collect(Collectors.toList());

		boolean allBatchesSaved = true;

		// Wait for all tasks to complete
		for (Future<List<TaxonomyGenus>> future : futures) {
			try {
				List<TaxonomyGenus> result = future.get();
				TransactionHelper.executeInTransaction(false, () -> taxonomyGenusRepository.saveAll(result));

				missingGenusId.removeAll(result.stream().map(TaxonomyGenus::getId).collect(Collectors.toList()));
				log.info("Updated {} genera", result.size());
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
				throw e;
			} catch (Exception e) {
				allBatchesSaved = false;
				log.error(e.getMessage(), e);
			}
		}

		if (!missingGenusId.isEmpty()) {
			log.warn("After refreshing grin_genus {} records remained untouched: grin_genus id={}", missingGenusId.size(), missingGenusId);
			if (allBatchesSaved) {
				try {
					taxonomyGenusRepository.deleteAllByIdInBatch(missingGenusId);
				} catch (Throwable e) {
					log.warn("Could not delete untouched TaxonomyGenus: {}", e.getMessage());
				}
			} else {
				log.warn("Not deleting untouched TaxonomyGenus because at least one batch failed to update");
			}
		}
	}

	/**
	 * Loads taxonomy_species.txt, saves all species and removes local species that
	 * are absent from the source, clearing Taxonomy2 references to them first.
	 */
	private void refreshSpecies() throws Exception {
		// read taxonomy_species.txt
		log.info("Loading taxonomy_species.txt");

		List<SpeciesRow> ggSpes = new ArrayList<>();
		try (FileInputStream input = new FileInputStream(new File(downloadFolder, "taxonomy_species.txt"));
			CSVReader reader = CabReader.openCsvReader(input, 0)) {
			Iterator<SpeciesRow> beanReader = CabReader.beanReader(SpeciesRow.class, reader).iterator();

			SpeciesRow speciesRow = null;
			while (beanReader.hasNext() && (speciesRow = beanReader.next()) != null) {
				ggSpes.add(speciesRow);
			}
		}

		final var missingSpeciesId = new HashSet<>(jpaQueryFactory.from(QTaxonomySpecies.taxonomySpecies).select(QTaxonomySpecies.taxonomySpecies.id).fetch());

		List<Future<List<TaxonomySpecies>>> futures = Lists.partition(ggSpes, 1000).stream().map(batch -> taskExecutor.submit(() -> {
			return updateSpecies(batch);
		})).collect(Collectors.toList());

		boolean allBatchesSaved = true;

		// Wait for scheduled tasks to complete
		for (Future<List<TaxonomySpecies>> future : futures) {
			try {
				List<TaxonomySpecies> result = future.get();
				TransactionHelper.executeInTransaction(false, () -> taxonomySpeciesRepository.saveAll(result));
				missingSpeciesId.removeAll(result.stream().map(TaxonomySpecies::getId).collect(Collectors.toList()));
				log.debug("Updated {} species", result.size());
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
				throw e;
			} catch (Exception e) {
				allBatchesSaved = false;
				log.error("Execution failed {}", e.getMessage(), e);
			}
		}

		if (!missingSpeciesId.isEmpty()) {
			log.warn("After refreshing grin_species {} records remained untouched: grin_species id={}", missingSpeciesId.size(), missingSpeciesId);
			if (allBatchesSaved) {
				try {
					// Clear references and delete in one transaction
					TransactionHelper.executeInTransaction(false, () -> {
						var clearedCount = jpaQueryFactory.update(QTaxonomy2.taxonomy2).setNull(QTaxonomy2.taxonomy2.grinTaxonomySpecies()).where(QTaxonomy2.taxonomy2.grinTaxonomySpecies().id.in(missingSpeciesId)).execute();
						log.warn("Cleared {} Taxonomy2.grinTaxonomySpecies references", clearedCount);
						clearedCount = jpaQueryFactory.update(QTaxonomy2.taxonomy2).setNull(QTaxonomy2.taxonomy2.currentTaxonomySpecies()).where(QTaxonomy2.taxonomy2.currentTaxonomySpecies().id.in(missingSpeciesId)).execute();
						log.warn("Cleared {} Taxonomy2.currentTaxonomySpecies references", clearedCount);
						clearedCount = jpaQueryFactory.update(QTaxonomy2.taxonomy2).setNull(QTaxonomy2.taxonomy2.overrideTaxonomySpecies()).where(QTaxonomy2.taxonomy2.overrideTaxonomySpecies().id.in(missingSpeciesId)).execute();
						log.warn("Cleared {} Taxonomy2.overrideTaxonomySpecies references", clearedCount);

						// Delete obsolete GRIN Taxonomy records
						taxonomySpeciesRepository.deleteAllByIdInBatch(missingSpeciesId);
						return true;
					});
				} catch (Throwable e) {
					log.warn("Could not delete untouched TaxonomySpecies: {}", e.getMessage(), e);
				}
			} else {
				// Species of failed batches are still referenced by Taxonomy2 and must be kept
				log.warn("Not deleting untouched TaxonomySpecies because at least one batch failed to update");
			}
		}
	}

	/**
	 * Loads taxonomy_common_name.txt, saves all common names and removes local
	 * common names that are absent from the source.
	 */
	private void refreshCommonNames() throws Exception {
		log.info("Loading taxonomy_common_name.txt");

		List<CommonNameRow> ggCommonNames = new ArrayList<>();
		try (FileInputStream input = new FileInputStream(new File(downloadFolder, "taxonomy_common_name.txt"));
			CSVReader reader = CabReader.openCsvReader(input, 0)) {
			Iterator<CommonNameRow> beanReader = CabReader.beanReader(CommonNameRow.class, reader).iterator();

			CommonNameRow commonNameRow = null;
			while (beanReader.hasNext() && (commonNameRow = beanReader.next()) != null) {
				ggCommonNames.add(commonNameRow);
			}
		}

		final var missingCommonNameId = new HashSet<>(jpaQueryFactory.from(QTaxonomyCommonName.taxonomyCommonName).select(QTaxonomyCommonName.taxonomyCommonName.id).fetch());

		List<Future<List<TaxonomyCommonName>>> futures = Lists.partition(ggCommonNames, 1000).stream().map(batch -> taskExecutor.submit(() -> {
			return updateCommonNames(batch);
		})).collect(Collectors.toList());

		boolean allBatchesSaved = true;

		// Wait for all tasks to complete
		for (Future<List<TaxonomyCommonName>> future : futures) {
			try {
				List<TaxonomyCommonName> result = future.get();
				TransactionHelper.executeInTransaction(false, () -> taxonomyCommonNameRepository.saveAll(result));
				missingCommonNameId.removeAll(result.stream().map(TaxonomyCommonName::getId).collect(Collectors.toList()));
				log.info("Updated {} taxonomy common names", result.size());
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
				throw e;
			} catch (Exception e) {
				allBatchesSaved = false;
				log.error(e.getMessage(), e);
			}
		}

		if (!missingCommonNameId.isEmpty()) {
			log.warn("After refreshing grin_common_name {} records remained untouched: grin_common_name id={}", missingCommonNameId.size(), missingCommonNameId);
			if (allBatchesSaved) {
				try {
					taxonomyCommonNameRepository.deleteAllByIdInBatch(missingCommonNameId);
				} catch (Throwable e) {
					log.warn("Could not delete untouched TaxonomyCommonName: {}", e.getMessage());
				}
			} else {
				log.warn("Not deleting untouched TaxonomyCommonName because at least one batch failed to update");
			}
		}
	}

	/**
	 * Converts a batch of GRIN family rows into {@link TaxonomyFamily} entities.
	 * Nothing is persisted here; the returned list is saved by the caller.
	 *
	 * @param batch source rows to convert
	 * @return one entity per source row, in the same order
	 */
	private List<TaxonomyFamily> updateFamily(List<FamilyRow> batch) {
		log.info("Processing {} families", batch.size());
		return batch.stream()
			.map(this::toTaxonomyFamily)
			.collect(Collectors.toCollection(() -> new ArrayList<>(batch.size())));
	}

	/**
	 * Copies the fields of one GRIN family row to a new {@link TaxonomyFamily}.
	 */
	private TaxonomyFamily toTaxonomyFamily(FamilyRow row) {
		final TaxonomyFamily entity = new TaxonomyFamily();
		// The GRIN identifier is used both as GRIN ID and as local ID
		entity.setGrinId(row.getTaxonomyFamilyId());
		entity.setId(row.getTaxonomyFamilyId());
		entity.setFamilyName(row.getFamilyName());
		entity.setFamilyAuthority(row.getFamilyAuthority());
		entity.setSubfamilyName(row.getSubfamilyName());
		entity.setTribeName(row.getTribeName());
		entity.setSubtribeName(row.getSubtribeName());
		entity.setSuprafamilyRankCode(row.getSuprafamilyRankCode());
		entity.setSuprafamilyRankName(row.getSuprafamilyRankName());
		entity.setAlternateName(row.getAlternateName());
		entity.setFamilyTypeCode(row.getFamilyTypeCode());
		entity.setCurrentTaxonomyFamily(TaxonomyFamily.withId(row.getCurrentTaxonomyFamilyId()));
		entity.setNote(row.getNote());

		// GG Audit; only the modified date is treated as optional
		entity.setOwnedDate(row.getOwnedDate().toInstant(ZoneOffset.UTC));
		entity.setOwnedById(row.getOwnedBy());
		entity.setCreatedDate(row.getCreatedDate().toInstant(ZoneOffset.UTC));
		entity.setCreatedById(row.getCreatedBy());
		if (row.getModifiedDate() != null) {
			entity.setModifiedDate(row.getModifiedDate().toInstant(ZoneOffset.UTC));
		}
		entity.setModifiedById(row.getModifiedBy());
		return entity;
	}

	/**
	 * Converts a batch of GRIN common name rows into {@link TaxonomyCommonName}
	 * entities. Nothing is persisted here; the returned list is saved by the caller.
	 *
	 * @param batch source rows to convert
	 * @return one entity per source row, in the same order
	 */
	private List<TaxonomyCommonName> updateCommonNames(List<CommonNameRow> batch) {
		log.info("Processing {} common names", batch.size());
		return batch.stream()
			.map(this::toTaxonomyCommonName)
			.collect(Collectors.toCollection(() -> new ArrayList<>(batch.size())));
	}

	/**
	 * Copies the fields of one GRIN common name row to a new
	 * {@link TaxonomyCommonName}. The citation ID of the row is not mapped.
	 */
	private TaxonomyCommonName toTaxonomyCommonName(CommonNameRow row) {
		final TaxonomyCommonName entity = new TaxonomyCommonName(row.getId());
		entity.setTaxonomyGenus(TaxonomyGenus.withId(row.getTaxonomyGenusId()));
		entity.setTaxonomySpecies(TaxonomySpecies.withId(row.getTaxonomySpeciesId()));
		entity.setLanguageDescription(row.getLanguageDescription());
		entity.setAlternateTranscription(row.getAlternateTranscription());
		entity.setName(row.getName());
		entity.setSimplifiedName(row.getSimplifiedName());
		entity.setNote(row.getNote());

		// GG Audit; only the modified date is treated as optional
		entity.setOwnedDate(row.getOwnedDate().toInstant(ZoneOffset.UTC));
		entity.setOwnedById(row.getOwnedBy());
		entity.setCreatedDate(row.getCreatedDate().toInstant(ZoneOffset.UTC));
		entity.setCreatedById(row.getCreatedBy());
		if (row.getModifiedDate() != null) {
			entity.setModifiedDate(row.getModifiedDate().toInstant(ZoneOffset.UTC));
		}
		entity.setModifiedById(row.getModifiedBy());
		return entity;
	}

	/**
	 * Converts a batch of GRIN species rows into {@link TaxonomySpecies} entities.
	 * Nothing is persisted here; the returned list is saved by the caller.
	 *
	 * @param batch source rows to convert
	 * @return one entity per source row, in the same order
	 */
	private List<TaxonomySpecies> updateSpecies(List<SpeciesRow> batch) {
		log.info("Processing {} species", batch.size());
		return batch.stream()
			.map(this::toTaxonomySpecies)
			.collect(Collectors.toCollection(() -> new ArrayList<>(batch.size())));
	}

	/**
	 * Copies the fields of one GRIN species row to a new {@link TaxonomySpecies}.
	 * Priority sites, curators and the verifier of the row are not mapped.
	 */
	private TaxonomySpecies toTaxonomySpecies(SpeciesRow row) {
		final TaxonomySpecies entity = new TaxonomySpecies(row.getSpeciesId());
		entity.setCurrentTaxonomySpecies(TaxonomySpecies.withId(row.getCurrentTaxonomySpeciesId()));
		entity.setTaxonomyGenus(new TaxonomyGenus(row.getGenusId()));
		entity.setName(row.getName());
		entity.setNameAuthority(row.getNameAuthority());

		entity.setNomenNumber(row.getNomenNumber().intValue());

		// Species rank
		entity.setIsSpecificHybrid(row.getIsSpecificHybrid());
		entity.setSpeciesName(row.getSpeciesName());
		entity.setSpeciesAuthority(row.getSpeciesAuthority());
		// Subspecies rank
		entity.setIsSubspecificHybrid(row.getIsSubspecificHybrid());
		entity.setSubspeciesName(row.getSubspeciesName());
		entity.setSubspeciesAuthority(row.getSubspeciesAuthority());
		// Variety rank
		entity.setIsVarietalHybrid(row.getIsVarietalHybrid());
		entity.setVarietyName(row.getVarietyName());
		entity.setVarietyAuthority(row.getVarietyAuthority());
		// Subvariety rank
		entity.setIsSubvarietalHybrid(row.getIsSubvarietalHybrid());
		entity.setSubvarietyName(row.getSubvarietyName());
		entity.setSubvarietyAuthority(row.getSubvarietyAuthority());
		// Forma rank
		entity.setIsFormaHybrid(row.getIsFormaHybrid());
		entity.setFormaRankType(row.getFormaRankType());
		entity.setFormaName(row.getFormaName());
		entity.setFormaAuthority(row.getFormaAuthority());

		entity.setRestrictionCode(row.getRestrictionCode());
		entity.setLifeFormCode(row.getLifeFormCode());
		entity.setCommonFertilizationCode(row.getCommonFertilizationCode());
		entity.setIsNamePending(row.getIsNamePending());
		entity.setSynonymCode(row.getSynonymCode());
		if (row.getNameVerifiedDate() != null) {
			entity.setNameVerifiedDate(row.getNameVerifiedDate().toInstant(ZoneOffset.UTC));
		}

		entity.setProtologue(row.getProtologue());
		entity.setProtologueVirtualPath(row.getProtologueVirtualPath());
		entity.setNote(row.getNote());
		entity.setSiteNote(row.getSiteNote());
		entity.setAlternateName(row.getAlternateName());

		// GG Audit; only the modified date is treated as optional
		entity.setOwnedDate(row.getOwnedDate().toInstant(ZoneOffset.UTC));
		entity.setOwnedById(row.getOwnedBy());
		entity.setCreatedDate(row.getCreatedDate().toInstant(ZoneOffset.UTC));
		entity.setCreatedById(row.getCreatedBy());
		if (row.getModifiedDate() != null) {
			entity.setModifiedDate(row.getModifiedDate().toInstant(ZoneOffset.UTC));
		}
		entity.setModifiedById(row.getModifiedBy());
		return entity;
	}

	/**
	 * Converts a batch of GRIN genus rows into {@link TaxonomyGenus} entities.
	 * Nothing is persisted here; the returned list is saved by the caller.
	 *
	 * @param batch source rows to convert
	 * @return one entity per source row, in the same order
	 */
	private List<TaxonomyGenus> updateGenera(List<GenusRow> batch) {
		log.info("Processing {} genera", batch.size());
		List<TaxonomyGenus> toSave = new ArrayList<>(batch.size());

		for (GenusRow genusRow : batch) {
			TaxonomyGenus genus = new TaxonomyGenus(genusRow.getGenusId());
			genus.setCurrentTaxonomyGenus(TaxonomyGenus.withId(genusRow.getCurrentTaxonomyGenusId()));

			// Name parts; each is set exactly once
			genus.setGenusName(genusRow.getGenusName());
			genus.setGenusAuthority(genusRow.getGenusAuthority());
			genus.setSubgenusName(genusRow.getSubgenusName());
			genus.setSectionName(genusRow.getSectionName());
			genus.setSubsectionName(genusRow.getSubsectionName());
			genus.setSeriesName(genusRow.getSeriesName());
			genus.setSubseriesName(genusRow.getSubseriesName());
			genus.setTaxonomyFamily(new TaxonomyFamily(genusRow.getTaxonomyFamilyId()));

			genus.setQualifyingCode(genusRow.getQualifyingCode());
			genus.setHybridCode(genusRow.getHybridCode());
			genus.setNote(genusRow.getNote());

			// GG Audit; only the modified date is treated as optional
			genus.setOwnedDate(genusRow.getOwnedDate().toInstant(ZoneOffset.UTC));
			genus.setOwnedById(genusRow.getOwnedBy());
			genus.setCreatedDate(genusRow.getCreatedDate().toInstant(ZoneOffset.UTC));
			genus.setCreatedById(genusRow.getCreatedBy());
			if (genusRow.getModifiedDate() != null) {
				genus.setModifiedDate(genusRow.getModifiedDate().toInstant(ZoneOffset.UTC));
			}
			genus.setModifiedById(genusRow.getModifiedBy());

			toSave.add(genus);
		}
		return toSave;
	}

	/**
	 * Downloads and unpacks the GRIN Taxonomy database into {@code folder} when
	 * {@code taxonomy_genus.txt} or {@code taxonomy_species.txt} is missing there
	 * or is reported as too old by {@link #isTooOld(File)}.
	 * <p>
	 * The downloaded cabinet file is a temporary file that is removed after
	 * unpacking, also when the download or unpacking fails. Failure to remove it
	 * is logged and does not fail the call.
	 * </p>
	 *
	 * @param folder target folder for the unpacked data; created if it does not exist
	 * @return {@code true} if data was downloaded and unpacked, {@code false} if
	 *         the existing files were kept
	 * @throws IOException if the folder cannot be created or the download or
	 *                     unpacking fails
	 */
	static boolean downloadDataIfNeeded(File folder) throws IOException {
		if (!folder.exists()) {
			log.warn("Making directory {}", folder.getAbsolutePath());
			// mkdirs() also returns false when the directory was created concurrently
			if (!folder.mkdirs() && !folder.isDirectory()) {
				throw new IOException("Failed to create data folder at " + folder.getAbsolutePath());
			}
		}

		// The two required files
		final File genusFile = new File(folder, "taxonomy_genus.txt");
		final File speciesFile = new File(folder, "taxonomy_species.txt");

		boolean needsDownload = (!genusFile.exists() || isTooOld(genusFile)) || (!speciesFile.exists() || isTooOld(speciesFile));
		if (!needsDownload) {
			return false;
		}

		log.warn("Taxonomy data not provided or too old in {}, starting download", folder.getAbsolutePath());
		final TaxonomyDownloader dl = new TaxonomyDownloader();

		log.warn("Downloading GRIN-Taxonomy database to {}", folder.getAbsolutePath());
		final File downloadedCabFile = File.createTempFile("grin-", ".cab");
		try {
			dl.downloadCurrent(downloadedCabFile);
			TaxonomyDownloader.unpackCabinetFile(downloadedCabFile, folder, false);
		} finally {
			// Always clean up the temporary file, without masking an exception from above
			if (downloadedCabFile.exists() && downloadedCabFile.canWrite()) {
				log.warn("Deleting downloaded file {}", downloadedCabFile.getAbsolutePath());
				try {
					FileUtils.forceDelete(downloadedCabFile);
				} catch (IOException e) {
					log.warn("Could not delete downloaded file {}: {}", downloadedCabFile.getAbsolutePath(), e.getMessage());
				}
			}
		}
		return true;
	}

	/**
	 * Tells whether the creation time reported for the file is more than 5 days
	 * in the past.
	 *
	 * @param theFile the file to check
	 * @return {@code true} if the file was created more than 5 days ago;
	 *         {@code false} otherwise, also when the attributes cannot be read
	 */
	private static boolean isTooOld(File theFile) {
		final Instant createdAt;
		try {
			createdAt = Files.readAttributes(theFile.toPath(), BasicFileAttributes.class).creationTime().toInstant();
		} catch (IOException e) {
			// Unknown age is treated as "not too old"
			log.warn("Could not determine age: {}", e.getMessage());
			return false;
		}

		final Instant oldestAccepted = Instant.now().minus(5, ChronoUnit.DAYS);
		final boolean isOld = createdAt.isBefore(oldestAccepted);
		log.warn("{} created {} is old={}", theFile.getName(), createdAt, isOld);
		return isOld;
	}
}